In [ ]:
# Research Question :
#     "1-How does the number of reviews (volume of reviews) affect the hotel rating or score over time"
#     "2-Does accumulating more reviews push average scores up or down? And does this depend on other variables or factors
#     "like the number of negative and positive words,The total number of reviews this particular reviewer has posted,How
#     "many days have elapsed since the review was posted? and also is the effect of late reviews is highr than later 
#     "reviews"

# The other coulmn named 'tags' which contain the type of the travel for reviewers or the number of the people traveling 
# was used for hetergenous treatment effect. 

# I used cumulative approach for dependent and independent variables because the nature of the data is panel and 
# cumulative reviews tells us how much reputation has been built so far and reflects all historical reviews, not just 
# this week's or day's reviews. The week based cumulation was applied beacuse not to be too noisy like daily, not too 
# aggregated like monthly.

# Two models across all type of the tests were applied : 
    
#     1- Without Fixed Effects (Pooled OLS):  
        
#         Avg Score_ht=β0+β1.log(Cumulative Reviews_ht)+β_2.log(Positive Words_ht)​+⋯+ε_ht
        
#     2- with Fixed effects(hotel and time fixed effect):    
        
#         Avg Score_ht=α_h+λ_t+β1.log(Cumulative Reviews_ht)+β2.log(Positive Words_ht)+⋯+ε_ht
        
In [1]:
import os

file_path = "Hotel.review.zip"  

if os.path.exists(file_path):
    print("File exists, proceeding with extraction.")
else:
    print("File not found. Please check the file path.")
File exists, proceeding with extraction.
In [2]:
import zipfile

extract_path = "./Hotel_Review" 

with zipfile.ZipFile(file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Extraction complete. Files:", os.listdir(extract_path))
Extraction complete. Files: ['Hotel_Reviews.csv']
In [3]:
import pandas as pd


extracted_files = os.listdir(extract_path)
print("Extracted files:", extracted_files)

csv_file = [f for f in extracted_files if f.endswith('.csv')][0] 
csv_path = os.path.join(extract_path, csv_file)

df = pd.read_csv(csv_path)
print("Dataset loaded successfully!")

df.head(100)
Extracted files: ['Hotel_Reviews.csv']
Dataset loaded successfully!
Out[3]:
Hotel_Address Additional_Number_of_Scoring Review_Date Average_Score Hotel_Name Reviewer_Nationality Negative_Review Review_Total_Negative_Word_Counts Total_Number_of_Reviews Positive_Review Review_Total_Positive_Word_Counts Total_Number_of_Reviews_Reviewer_Has_Given Reviewer_Score Tags days_since_review lat lng
0 s Gravesandestraat 55 Oost 1092 AA Amsterdam ... 194 8/3/2017 7.7 Hotel Arena Russia I am so angry that i made this post available... 397 1403 Only the park outside of the hotel was beauti... 11 7 2.9 [' Leisure trip ', ' Couple ', ' Duplex Double... 0 days 52.360576 4.915968
1 s Gravesandestraat 55 Oost 1092 AA Amsterdam ... 194 8/3/2017 7.7 Hotel Arena Ireland No Negative 0 1403 No real complaints the hotel was great great ... 105 7 7.5 [' Leisure trip ', ' Couple ', ' Duplex Double... 0 days 52.360576 4.915968
2 s Gravesandestraat 55 Oost 1092 AA Amsterdam ... 194 7/31/2017 7.7 Hotel Arena Australia Rooms are nice but for elderly a bit difficul... 42 1403 Location was good and staff were ok It is cut... 21 9 7.1 [' Leisure trip ', ' Family with young childre... 3 days 52.360576 4.915968
3 s Gravesandestraat 55 Oost 1092 AA Amsterdam ... 194 7/31/2017 7.7 Hotel Arena United Kingdom My room was dirty and I was afraid to walk ba... 210 1403 Great location in nice surroundings the bar a... 26 1 3.8 [' Leisure trip ', ' Solo traveler ', ' Duplex... 3 days 52.360576 4.915968
4 s Gravesandestraat 55 Oost 1092 AA Amsterdam ... 194 7/24/2017 7.7 Hotel Arena New Zealand You When I booked with your company on line y... 140 1403 Amazing location and building Romantic setting 8 3 6.7 [' Leisure trip ', ' Couple ', ' Suite ', ' St... 10 days 52.360576 4.915968
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
95 s Gravesandestraat 55 Oost 1092 AA Amsterdam ... 194 10/4/2016 7.7 Hotel Arena United Kingdom The work being done externally right now is o... 27 1403 Great room very comfortable bed Nice and quie... 97 1 9.6 [' Leisure trip ', ' Couple ', ' Duplex Double... 303 day 52.360576 4.915968
96 s Gravesandestraat 55 Oost 1092 AA Amsterdam ... 194 9/27/2016 7.7 Hotel Arena United Kingdom No Negative 0 1403 We upgraded to a larger room Had the bath inf... 38 1 10.0 [' Leisure trip ', ' Couple ', ' Suite ', ' St... 310 day 52.360576 4.915968
97 s Gravesandestraat 55 Oost 1092 AA Amsterdam ... 194 9/27/2016 7.7 Hotel Arena United Kingdom Our room was a little compact probably due to... 31 1403 A very quirky hotel that managed to keep its ... 29 2 8.8 [' Leisure trip ', ' Couple ', ' Duplex Double... 310 day 52.360576 4.915968
98 s Gravesandestraat 55 Oost 1092 AA Amsterdam ... 194 9/19/2016 7.7 Hotel Arena United Kingdom Got charged 50 for a birthday package when it... 112 1403 No Positive 0 1 5.0 [' Leisure trip ', ' Couple ', ' Large King Ro... 318 day 52.360576 4.915968
99 s Gravesandestraat 55 Oost 1092 AA Amsterdam ... 194 9/19/2016 7.7 Hotel Arena Ireland split level bedrooms although quirky are impr... 45 1403 beautiful surroundings great location Old wor... 14 10 6.3 [' Business trip ', ' Solo traveler ', ' Duple... 318 day 52.360576 4.915968

100 rows × 17 columns

In [ ]:
# 1- Hotel_Address: The street address and city/country of the hotel.
# 2- Additional_Number_of_Scoring: Possibly the number of additional scores or ratings (beyond this particular review) that the hotel has received.
# 3- Review_Date: The date when the review was submitted.
# 4- Average_Score: The hotel’s overall average score at the time of the review.
# 5- Hotel_Name: The name of the hotel reviewed.
# 6- Reviewer_Nationality: The nationality of the person who wrote the review.
# 7- Negative_Review: The negative portion of the reviewer’s comment. Some reviewers only leave positive feedback, so “No Negative” or similar might appear if they did not include criticisms.
# 8- Review_Total_Negative_Word_Counts: A count of how many words appear in the negative comment section.
# 9- Total_Number_of_Reviews: The total number of reviews the hotel has received (from all guests).
# 10- Positive_Review: The positive portion of the reviewer’s comment. Similarly, some reviews might read “No Positive.”
# 11- Review_Total_Positive_Word_Counts: A count of how many words appear in the positive comment section.
# 12- Total_Number_of_Reviews_Reviewer_Has_Given: The total number of reviews this particular reviewer has posted (for all hotels, not just this one).
# 13- Reviewer_Score: The numerical score (rating) that the reviewer gave to the hotel for this specific review.
# 14- Tags: A list or set of descriptive tags about the trip context (e.g., “Leisure trip,” “Couple,” “Solo traveler,” room type, length of stay, etc.).
# 15- days_since_review: How many days have elapsed since the review was posted (relative to some reference date).
# 16- lat and lng: The latitude and longitude of the hotel’s location (some hotels may be missing these coordinates).
In [4]:
df.shape
Out[4]:
(515738, 17)
In [5]:
df.dtypes
Out[5]:
Hotel_Address                                  object
Additional_Number_of_Scoring                    int64
Review_Date                                    object
Average_Score                                 float64
Hotel_Name                                     object
Reviewer_Nationality                           object
Negative_Review                                object
Review_Total_Negative_Word_Counts               int64
Total_Number_of_Reviews                         int64
Positive_Review                                object
Review_Total_Positive_Word_Counts               int64
Total_Number_of_Reviews_Reviewer_Has_Given      int64
Reviewer_Score                                float64
Tags                                           object
days_since_review                              object
lat                                           float64
lng                                           float64
dtype: object
In [6]:
df.describe()
Out[6]:
Additional_Number_of_Scoring Average_Score Review_Total_Negative_Word_Counts Total_Number_of_Reviews Review_Total_Positive_Word_Counts Total_Number_of_Reviews_Reviewer_Has_Given Reviewer_Score lat lng
count 515738.000000 515738.000000 515738.000000 515738.000000 515738.000000 515738.000000 515738.000000 512470.000000 512470.000000
mean 498.081836 8.397487 18.539450 2743.743944 17.776458 7.166001 8.395077 49.442439 2.823803
std 500.538467 0.548048 29.690831 2317.464868 21.804185 11.040228 1.637856 3.466325 4.579425
min 1.000000 5.200000 0.000000 43.000000 0.000000 1.000000 2.500000 41.328376 -0.369758
25% 169.000000 8.100000 2.000000 1161.000000 5.000000 1.000000 7.500000 48.214662 -0.143372
50% 341.000000 8.400000 9.000000 2134.000000 11.000000 3.000000 8.800000 51.499981 0.010607
75% 660.000000 8.800000 23.000000 3613.000000 22.000000 8.000000 9.600000 51.516288 4.834443
max 2682.000000 9.800000 408.000000 16670.000000 395.000000 355.000000 10.000000 52.400181 16.429233
In [7]:
df.isna().sum()
Out[7]:
Hotel_Address                                    0
Additional_Number_of_Scoring                     0
Review_Date                                      0
Average_Score                                    0
Hotel_Name                                       0
Reviewer_Nationality                             0
Negative_Review                                  0
Review_Total_Negative_Word_Counts                0
Total_Number_of_Reviews                          0
Positive_Review                                  0
Review_Total_Positive_Word_Counts                0
Total_Number_of_Reviews_Reviewer_Has_Given       0
Reviewer_Score                                   0
Tags                                             0
days_since_review                                0
lat                                           3268
lng                                           3268
dtype: int64
In [101]:
df.duplicated()
Out[101]:
0         False
1         False
2         False
3         False
4         False
          ...  
515733    False
515734    False
515735    False
515736    False
515737    False
Length: 515738, dtype: bool
In [8]:
import matplotlib.pyplot as plt

# 1. Histogram for Reviewer Scores
fig1 = plt.figure(figsize=(8, 5))
plt.hist(df["Reviewer_Score"], bins=20, edgecolor="black", alpha=0.7)
plt.xlabel("Reviewer Score")
plt.ylabel("Frequency")
plt.title("Distribution of Reviewer Scores")
fig1.savefig("reviewer_scores_histogram.png", dpi=300, bbox_inches="tight")  # Save figure
plt.show()

# 2. Histogram for Average Hotel Score
fig2 = plt.figure(figsize=(8, 5))
plt.hist(df["Average_Score"], bins=20, edgecolor="black", alpha=0.7)
plt.xlabel("Average Hotel Score")
plt.ylabel("Frequency")
plt.title("Distribution of Average Hotel Scores")
fig2.savefig("average_hotel_scores_histogram.png", dpi=300, bbox_inches="tight")  # Save figure
plt.show()

# 3. Bar plot for Top 10 Reviewer Nationalities
top_nationalities = df["Reviewer_Nationality"].value_counts().head(10)
fig3 = plt.figure(figsize=(10, 5))
top_nationalities.plot(kind="bar", color="skyblue", edgecolor="black")
plt.xlabel("Nationality")
plt.ylabel("Number of Reviews")
plt.title("Top 10 Reviewer Nationalities")
plt.xticks(rotation=45)
fig3.savefig("top_reviewer_nationalities.png", dpi=300, bbox_inches="tight")  # Save figure
plt.show()
In [9]:
plt.figure(figsize=(8, 5))
plt.hist(df["Reviewer_Score"], bins=20, edgecolor="black", alpha=0.7)
plt.xlabel("Reviewer Score")
plt.ylabel("Frequency")
plt.title("Distribution of Reviewer Scores")

plt.savefig("reviewer_scores_histogram.png", dpi=300, bbox_inches="tight")  # Save the figure
plt.show()  
In [10]:
import matplotlib.pyplot as plt

#  histogram for Reviewer Scores
plt.figure(figsize=(8, 5))
plt.hist(df["Reviewer_Score"], bins=20, edgecolor="black", alpha=0.7)
plt.xlabel("Reviewer Score")
plt.ylabel("Frequency")
plt.title("Distribution of Reviewer Scores")
plt.show()

#  histogram for Average Hotel Score
plt.figure(figsize=(8, 5))
plt.hist(df["Average_Score"], bins=20, edgecolor="black", alpha=0.7)
plt.xlabel("Average Hotel Score")
plt.ylabel("Frequency")
plt.title("Distribution of Average Hotel Scores")
plt.show()

# Counting of reviews per country (Reviewer_Nationality)
top_nationalities = df["Reviewer_Nationality"].value_counts().head(10)
plt.figure(figsize=(10, 5))
top_nationalities.plot(kind="bar", color="skyblue", edgecolor="black")
plt.xlabel("Nationality")
plt.ylabel("Number of Reviews")
plt.title("Top 10 Reviewer Nationalities")
plt.xticks(rotation=45)
plt.show()
In [11]:
#  histogram for Review_Total_Negative_Word_Counts
plt.figure(figsize=(8, 5))
plt.hist(df["Review_Total_Negative_Word_Counts"], bins=50, edgecolor="black", alpha=0.7)
plt.xlabel("Total Negative Word Count")
plt.ylabel("Frequency")
plt.title("Distribution of Total Negative Word Counts in Reviews")
plt.show()

#  histogram for Review_Total_Positive_Word_Counts
plt.figure(figsize=(8, 5))
plt.hist(df["Review_Total_Positive_Word_Counts"], bins=50, edgecolor="black", alpha=0.7)
plt.xlabel("Total Positive Word Count")
plt.ylabel("Frequency")
plt.title("Distribution of Total Positive Word Counts in Reviews")
plt.show()

#  histogram for Additional_Number_of_Scoring
plt.figure(figsize=(8, 5))
plt.hist(df["Additional_Number_of_Scoring"], bins=50, edgecolor="black", alpha=0.7)
plt.xlabel("Additional Number of Scoring")
plt.ylabel("Frequency")
plt.title("Distribution of Additional Number of Scoring")
plt.show()
In [12]:
# Scatter plot of Average Score vs. Total Reviews of Hotels
plt.figure(figsize=(8, 5))
plt.scatter(df["Total_Number_of_Reviews"], df["Average_Score"], alpha=0.5)
plt.xlabel("Total Number of Reviews per Hotel")
plt.ylabel("Average Hotel Score")
plt.title("Total Reviews vs. Average Score")
plt.show()
In [13]:
import seaborn as sns

# Histogram for Additional_Number_of_Scoring
plt.figure(figsize=(8, 5))
sns.histplot(df["Additional_Number_of_Scoring"], bins=50, kde=True, color="blue")
plt.xlabel("Additional Number of Scoring")
plt.ylabel("Frequency")
plt.title("Distribution of Additional Number of Scoring")
plt.show()

# Boxplot for Reviewer_Score vs. Average_Score
plt.figure(figsize=(8, 5))
sns.boxplot(x=df["Reviewer_Score"], y=df["Average_Score"], palette="coolwarm")
plt.xlabel("Reviewer Score")
plt.ylabel("Average Score of Hotel")
plt.title("Reviewer Score vs. Hotel's Average Score")
plt.show()

# Scatter plot for Additional_Number_of_Scoring vs. Average_Score
plt.figure(figsize=(8, 5))
sns.scatterplot(x=df["Additional_Number_of_Scoring"], y=df["Average_Score"], alpha=0.5, color="purple")
plt.xlabel("Additional Number of Scoring")
plt.ylabel("Average Score")
plt.title("Additional Number of Scoring vs. Average Hotel Score")
plt.show()

# Countinh plot for top 10 most reviewed hotels
plt.figure(figsize=(10, 5))
top_hotels = df["Hotel_Name"].value_counts().head(10)
sns.barplot(x=top_hotels.index, y=top_hotels.values, palette="viridis")
plt.xlabel("Hotel Name")
plt.ylabel("Number of Reviews")
plt.xticks(rotation=45)
plt.title("Top 10 Most Reviewed Hotels")
plt.show()

# Bar plot for top 10 reviewer nationalities
plt.figure(figsize=(10, 5))
top_nationalities = df["Reviewer_Nationality"].value_counts().head(10)
sns.barplot(x=top_nationalities.index, y=top_nationalities.values, palette="Blues_r")
plt.xlabel("Nationality")
plt.ylabel("Number of Reviews")
plt.xticks(rotation=45)
plt.title("Top 10 Reviewer Nationalities")
plt.show()
In [117]:
plt.figure(figsize=(10, 6))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap of Numeric Features")
plt.show()
In [14]:
# Extracting the country from the hotel address (last word of the address)
df['Country'] = df['Hotel_Address'].apply(lambda x: x.split()[-1])

# Counting unique hotels per country
hotels_per_country = df.groupby('Country')['Hotel_Name'].nunique()

# Counting total number of reviews per country
reviews_per_country = df['Country'].value_counts()

# merging into a single DataFrame
country_stats = pd.DataFrame({
    'Number_of_Hotels': hotels_per_country,
    'Number_of_Reviews': reviews_per_country
}).reset_index().rename(columns={'index': 'Country'})


country_stats.head(7)
Out[14]:
Country Number_of_Hotels Number_of_Reviews
0 Austria 158 38939
1 France 458 59928
2 Italy 162 37207
3 Kingdom 400 262301
4 Netherlands 105 57214
5 Spain 211 60149
In [15]:
# since the amount of the data is substantial, all data except U.K. was removed from the main data. 
# Filter the DataFrame for hotels located in the United Kingdom
#using use 'Hotel_Address' to identify UK hotels (addresses containing 'United Kingdom')

uk_hotels_df = df[df['Hotel_Address'].str.contains('United Kingdom', na=False)]
uk_hotels_df.head()
Out[15]:
Hotel_Address Additional_Number_of_Scoring Review_Date Average_Score Hotel_Name Reviewer_Nationality Negative_Review Review_Total_Negative_Word_Counts Total_Number_of_Reviews Positive_Review Review_Total_Positive_Word_Counts Total_Number_of_Reviews_Reviewer_Has_Given Reviewer_Score Tags days_since_review lat lng Country
405 1 15 Templeton Place Earl s Court Kensington a... 244 8/3/2017 8.5 K K Hotel George Ireland Really nothing negative to say about this hotel 9 1831 Very comfortable beds smart bathroom good sho... 18 2 9.6 [' Leisure trip ', ' Group ', ' Classic Twin R... 0 days 51.491888 -0.194971 Kingdom
406 1 15 Templeton Place Earl s Court Kensington a... 244 8/2/2017 8.5 K K Hotel George United States of America No coffee or tea in the room or lobby No one ... 52 1831 No Positive 0 1 7.1 [' Leisure trip ', ' Couple ', ' Classic Twin ... 1 days 51.491888 -0.194971 Kingdom
407 1 15 Templeton Place Earl s Court Kensington a... 244 7/20/2017 8.5 K K Hotel George United States of America Room was pretty small for the three of us 11 1831 The staff was very friendly and helpful in or... 27 3 9.2 [' Leisure trip ', ' Family with young childre... 14 days 51.491888 -0.194971 Kingdom
408 1 15 Templeton Place Earl s Court Kensington a... 244 7/19/2017 8.5 K K Hotel George Australia Nothing really 4 1831 We stayed at the hotel at a friend s recommen... 66 15 9.6 [' Leisure trip ', ' Couple ', ' Classic Doubl... 15 days 51.491888 -0.194971 Kingdom
409 1 15 Templeton Place Earl s Court Kensington a... 244 7/18/2017 8.5 K K Hotel George United States of America Only cab service to airport is expensive comp... 16 1831 Super location on Division underground line d... 60 1 8.3 [' Leisure trip ', ' Group ', ' Classic Twin R... 16 days 51.491888 -0.194971 Kingdom
In [24]:
uk_hotels_df.shape[0]
Out[24]:
262301
In [16]:
#  histogram for Reviewer Scores for UK. 
plt.figure(figsize=(8, 5))
plt.hist(uk_hotels_df["Reviewer_Score"], bins=20, edgecolor="black", alpha=0.7)
plt.xlabel("Reviewer Score")
plt.ylabel("Frequency")
plt.title("Distribution of Reviewer Scores")
plt.show()

#  histogram for Average Hotel Score
plt.figure(figsize=(8, 5))
plt.hist(uk_hotels_df["Average_Score"], bins=20, edgecolor="black", alpha=0.7)
plt.xlabel("Average Hotel Score")
plt.ylabel("Frequency")
plt.title("Distribution of Average Hotel Scores")
plt.show()

# Counting of reviews per country (Reviewer_Nationality)
top_nationalities = uk_hotels_df["Reviewer_Nationality"].value_counts().head(10)
plt.figure(figsize=(10, 5))
top_nationalities.plot(kind="bar", color="skyblue", edgecolor="black")
plt.xlabel("Nationality")
plt.ylabel("Number of Reviews")
plt.title("Top 10 Reviewer Nationalities")
plt.xticks(rotation=45)
plt.show()
In [19]:
# Counting of reviews per country (Reviewer_Nationality)
top_nationalities = uk_hotels_df["Reviewer_Nationality"].value_counts().head(10)
fig = plt.figure(figsize=(12, 8))  # Large figure size in inches
top_nationalities.plot(kind="bar", color="skyblue", edgecolor="black")
plt.xlabel("Nationality")
plt.ylabel("Number of Reviews")
plt.title("Top 10 Reviewer Nationalities")
plt.xticks(rotation=45)

fig.savefig("Natinality.png", dpi=600, bbox_inches='tight')

plt.show()
In [18]:
#  histogram for Average Hotel Score
fig = plt.figure(figsize=(12, 8))  
plt.hist(uk_hotels_df["Average_Score"], bins=20, edgecolor="black", alpha=0.7)
plt.xlabel("Average Hotel Score")
plt.ylabel("Frequency")
plt.title("Distribution of Average Hotel Scores")
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

fig.savefig("Distribution of Average Hotel Scores.png", dpi=600, bbox_inches='tight')

plt.show()
In [118]:
fig = plt.figure(figsize=(12, 8)) 
plt.hist(uk_hotels_df["Reviewer_Score"], bins=20, edgecolor="black", alpha=0.7)
plt.xlabel("Reviewer Score", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.title("Distribution of Reviewer Scores", fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

fig.savefig("reviewer_score_histogram_hd.png", dpi=600, bbox_inches='tight')

plt.show()
In [15]:
#  histogram for Review_Total_Negative_Word_Counts
plt.figure(figsize=(8, 5))
plt.hist(uk_hotels_df["Review_Total_Negative_Word_Counts"], bins=50, edgecolor="black", alpha=0.7)
plt.xlabel("Total Negative Word Count")
plt.ylabel("Frequency")
plt.title("Distribution of Total Negative Word Counts in Reviews")
plt.show()

#  histogram for Review_Total_Positive_Word_Counts
plt.figure(figsize=(8, 5))
plt.hist(uk_hotels_df["Review_Total_Positive_Word_Counts"], bins=50, edgecolor="black", alpha=0.7)
plt.xlabel("Total Positive Word Count")
plt.ylabel("Frequency")
plt.title("Distribution of Total Positive Word Counts in Reviews")
plt.show()

#  histogram for Additional_Number_of_Scoring
plt.figure(figsize=(8, 5))
plt.hist(uk_hotels_df["Additional_Number_of_Scoring"], bins=50, edgecolor="black", alpha=0.7)
plt.xlabel("Additional Number of Scoring")
plt.ylabel("Frequency")
plt.title("Distribution of Additional Number of Scoring")
plt.show()
In [23]:
fig = plt.figure(figsize=(12, 8)) 
plt.scatter(uk_hotels_df["Total_Number_of_Reviews"], uk_hotels_df["Average_Score"], alpha=0.5)
plt.xlabel("Total Number of Reviews per Hotel")
plt.ylabel("Average Hotel Score")
plt.title("Total Reviews vs. Average Score (UK Hotels)")
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

fig.savefig("Comp.png", bbox_inches="tight")

plt.show()
In [25]:
uk_hotels_summary = uk_hotels_df[['Hotel_Name', 'Reviewer_Score', 'Review_Date']]

uk_hotels_summary
Out[25]:
Hotel_Name Reviewer_Score Review_Date
405 K K Hotel George 9.6 8/3/2017
406 K K Hotel George 7.1 8/2/2017
407 K K Hotel George 9.2 7/20/2017
408 K K Hotel George 9.6 7/19/2017
409 K K Hotel George 8.3 7/18/2017
... ... ... ...
515408 Holiday Inn London Kensington 9.6 8/5/2015
515409 Holiday Inn London Kensington 10.0 8/4/2015
515410 Holiday Inn London Kensington 5.0 8/4/2015
515411 Holiday Inn London Kensington 8.8 8/4/2015
515412 Holiday Inn London Kensington 9.6 8/4/2015

262301 rows × 3 columns

In [28]:
uk_hotels_summary = uk_hotels_df[['Hotel_Name', 'Reviewer_Score', 'Review_Date']].copy()

# Converting 'Review_Date' to datetime format
uk_hotels_summary['Review_Date'] = pd.to_datetime(uk_hotels_summary['Review_Date'])

#  Creating 'Year_Week' column to represent the week of the year
uk_hotels_summary['Year_Week'] = uk_hotels_summary['Review_Date'].dt.strftime('%Y-%U')

# Grouping by 'Hotel_Name' and 'Year_Week' to compute weekly statistics
weekly_summary = uk_hotels_summary.groupby(['Hotel_Name', 'Year_Week']).agg(
    Reviews_This_Week=('Reviewer_Score', 'count'),
    Sum_Scores_This_Week=('Reviewer_Score', 'sum'),
    Avg_Score_This_Week=('Reviewer_Score', 'mean')
).reset_index()

#  Sorting values to prepare for cumulative calculations
weekly_summary = weekly_summary.sort_values(['Hotel_Name', 'Year_Week'])

#  Calculating cumulative statistics
weekly_summary['Cumulative_Reviews'] = weekly_summary.groupby('Hotel_Name')['Reviews_This_Week'].cumsum()
weekly_summary['Cumulative_Sum_Scores'] = weekly_summary.groupby('Hotel_Name')['Sum_Scores_This_Week'].cumsum()
weekly_summary['Cumulative_Avg_Score'] = (
    weekly_summary['Cumulative_Sum_Scores'] / weekly_summary['Cumulative_Reviews']
)


weekly_summary
Out[28]:
Hotel_Name Year_Week Reviews_This_Week Sum_Scores_This_Week Avg_Score_This_Week Cumulative_Reviews Cumulative_Sum_Scores Cumulative_Avg_Score
0 11 Cadogan Gardens 2015-32 1 10.0 10.000000 1 10.0 10.000000
1 11 Cadogan Gardens 2015-35 1 10.0 10.000000 2 20.0 10.000000
2 11 Cadogan Gardens 2015-36 2 13.8 6.900000 4 33.8 8.450000
3 11 Cadogan Gardens 2015-37 1 9.2 9.200000 5 43.0 8.600000
4 11 Cadogan Gardens 2015-38 2 17.9 8.950000 7 60.9 8.700000
... ... ... ... ... ... ... ... ...
36129 every hotel Piccadilly 2017-27 3 25.4 8.466667 553 4966.0 8.980108
36130 every hotel Piccadilly 2017-28 1 10.0 10.000000 554 4976.0 8.981949
36131 every hotel Piccadilly 2017-29 2 14.2 7.100000 556 4990.2 8.975180
36132 every hotel Piccadilly 2017-30 4 30.9 7.725000 560 5021.1 8.966250
36133 every hotel Piccadilly 2017-31 8 72.6 9.075000 568 5093.7 8.967782

36134 rows × 8 columns

In [127]:
import pandas as pd


uk_hotels_summary = uk_hotels_df[['Hotel_Name', 'Reviewer_Score', 'Review_Date']].copy()

# Step 2: Converting 'Review_Date' to datetime
uk_hotels_summary['Review_Date'] = pd.to_datetime(uk_hotels_summary['Review_Date'])

# Step 3: Creating 'Year_Week' (week of year)
uk_hotels_summary['Year_Week'] = uk_hotels_summary['Review_Date'].dt.strftime('%Y-%U')

# Step 4: Grouping by hotel and week to calculate stats
weekly_summary = uk_hotels_summary.groupby(['Hotel_Name', 'Year_Week']).agg(
    Reviews_This_Week=('Reviewer_Score', 'count'),
    Sum_Scores_This_Week=('Reviewer_Score', 'sum'),
    Avg_Score_This_Week=('Reviewer_Score', 'mean')
).reset_index()

# Step 5: Sorting for cumulative calculations
weekly_summary = weekly_summary.sort_values(['Hotel_Name', 'Year_Week'])

# Step 6: Calculating cumulative metrics
weekly_summary['Cumulative_Reviews'] = weekly_summary.groupby('Hotel_Name')['Reviews_This_Week'].cumsum()
weekly_summary['Cumulative_Sum_Scores'] = weekly_summary.groupby('Hotel_Name')['Sum_Scores_This_Week'].cumsum()
weekly_summary['Cumulative_Avg_Score'] = (
    weekly_summary['Cumulative_Sum_Scores'] / weekly_summary['Cumulative_Reviews']
)

# Step 7:  HTML
styled_table = weekly_summary.head(30).style.set_table_attributes('border="1" class="dataframe table table-hover table-bordered"') \
    .set_caption("Weekly Summary of Hotel Reviews")

styled_table.to_html("weekly_summary_table.html")

print("✅ HTML table saved. You can open 'weekly_summary_table.html' in Word.")
✅ HTML table saved. You can open 'weekly_summary_table.html' in Word.
In [27]:
# All reviews in the same hotel and same week are aggregated into one row. because of that 262301 rows tuened into 262301 rows
In [131]:
fig = plt.figure(figsize=(11, 6), dpi=200)

# Plotting each hotel
for hotel in top_hotels:
    hotel_data = monthly_summary[monthly_summary['Hotel_Name'] == hotel]
    plt.plot(hotel_data['Year_Month'], hotel_data['Cumulative_Avg_Score'], label=hotel)

# Setting titles and labels
plt.title('Cumulative Avg. Score Over Time (Top 10 UK Hotels)', fontsize=8)
plt.xlabel('Time (Monthly)', fontsize=8)
plt.ylabel('Cumulative Avg. Score', fontsize=8)

plt.xticks(fontsize=8)
plt.yticks(fontsize=8)

# Legend 
plt.legend(title='Hotel Name', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=6, title_fontsize=7)

# Grid and layout
plt.grid(True)
plt.tight_layout()

plt.savefig("top10_hotels_scores.png", bbox_inches='tight')
plt.show()
In [29]:
# plotting Cumulative_Avg_Score vs. Cumulative_Reviews for the top 10 UK hotels

plt.figure(figsize=(14, 8))

for hotel in top_hotels:
    hotel_data = weekly_summary[weekly_summary['Hotel_Name'] == hotel]
    plt.plot(hotel_data['Cumulative_Reviews'], hotel_data['Cumulative_Avg_Score'], label=hotel)

plt.title('Cumulative Average Score vs. Cumulative Reviews (Top 10 UK Hotels)')
plt.xlabel('Cumulative Number of Reviews')
plt.ylabel('Cumulative Average Score')
plt.legend(title='Hotel Name', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()
In [20]:
# time variation without fix effect : 
# Cumulative_Avg_Score_it = α+β⋅log(Cumulative_Reviews_it)+ ε_it
In [29]:
import statsmodels.api as sm


regression_data = weekly_summary[weekly_summary['Cumulative_Reviews'] > 0].copy()

# dependent variable (Y) and independent variable (X)
Y = regression_data['Cumulative_Avg_Score']
X = np.log(regression_data['Cumulative_Reviews'])

# Adding constant to the independent variable
X = sm.add_constant(X)

model = sm.OLS(Y, X).fit()

model.summary()
Out[29]:
OLS Regression Results
Dep. Variable: Cumulative_Avg_Score R-squared: 0.013
Model: OLS Adj. R-squared: 0.012
Method: Least Squares F-statistic: 458.0
Date: Wed, 07 May 2025 Prob (F-statistic): 5.68e-101
Time: 12:37:40 Log-Likelihood: -36227.
No. Observations: 36134 AIC: 7.246e+04
Df Residuals: 36132 BIC: 7.247e+04
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 8.6995 0.014 622.259 0.000 8.672 8.727
Cumulative_Reviews -0.0555 0.003 -21.400 0.000 -0.061 -0.050
Omnibus: 3352.392 Durbin-Watson: 0.066
Prob(Omnibus): 0.000 Jarque-Bera (JB): 4917.817
Skew: -0.727 Prob(JB): 0.00
Kurtosis: 4.074 Cond. No. 22.4


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# without log transformation. 
In [32]:
Y = regression_data['Cumulative_Avg_Score']
X = regression_data['Cumulative_Reviews']

X = sm.add_constant(X)


model_no_log = sm.OLS(Y, X).fit()

model_no_log.summary()
Out[32]:
OLS Regression Results
Dep. Variable: Cumulative_Avg_Score R-squared: 0.011
Model: OLS Adj. R-squared: 0.011
Method: Least Squares F-statistic: 409.8
Date: Thu, 17 Apr 2025 Prob (F-statistic): 1.31e-90
Time: 22:24:43 Log-Likelihood: -36250.
No. Observations: 36134 AIC: 7.250e+04
Df Residuals: 36132 BIC: 7.252e+04
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 8.4676 0.004 1883.124 0.000 8.459 8.476
Cumulative_Reviews -0.0002 7.83e-06 -20.243 0.000 -0.000 -0.000
Omnibus: 2906.001 Durbin-Watson: 0.065
Prob(Omnibus): 0.000 Jarque-Bera (JB): 4027.959
Skew: -0.675 Prob(JB): 0.00
Kurtosis: 3.923 Cond. No. 744.


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# Now I want to comapre the impact of early review with late reviews ( early reviews are 25 percent of the fisrt reviews)
In [129]:
regression_data = weekly_summary.copy()

regression_data = regression_data[regression_data['Cumulative_Reviews'] > 0].copy()

# For each hotel, calculating the total cumulative reviews (final total)
final_reviews = regression_data.groupby('Hotel_Name')['Cumulative_Reviews'].max().reset_index()
final_reviews.rename(columns={'Cumulative_Reviews': 'Final_Cumulative_Reviews'}, inplace=True)

# Merging final cumulative reviews back into the data
regression_data = regression_data.merge(final_reviews, on='Hotel_Name', how='left')

# Defining Early_Period (1 if cumulative reviews < 25% of final, else 0)
regression_data['Early_Period'] = (regression_data['Cumulative_Reviews'] < 0.25 * regression_data['Final_Cumulative_Reviews']).astype(int)

#  interaction term
regression_data['Interaction_Term'] = regression_data['Cumulative_Reviews'] * regression_data['Early_Period']


Y = regression_data['Cumulative_Avg_Score']
X = regression_data[['Cumulative_Reviews', 'Early_Period', 'Interaction_Term']]


X = sm.add_constant(X)

model = sm.OLS(Y, X).fit()

print(model.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.023
Model:                              OLS   Adj. R-squared:                  0.023
Method:                   Least Squares   F-statistic:                     279.3
Date:                  Thu, 08 May 2025   Prob (F-statistic):          3.21e-179
Time:                          00:35:41   Log-Likelihood:                -36040.
No. Observations:                 36134   AIC:                         7.209e+04
Df Residuals:                     36130   BIC:                         7.212e+04
Df Model:                             3                                         
Covariance Type:              nonrobust                                         
======================================================================================
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                  8.5226      0.006   1512.262      0.000       8.512       8.534
Cumulative_Reviews    -0.0002   8.48e-06    -23.945      0.000      -0.000      -0.000
Early_Period          -0.0898      0.010     -8.706      0.000      -0.110      -0.070
Interaction_Term      -0.0006   6.23e-05    -10.217      0.000      -0.001      -0.001
==============================================================================
Omnibus:                     2798.473   Durbin-Watson:                   0.066
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             3838.794
Skew:                          -0.661   Prob(JB):                         0.00
Kurtosis:                       3.896   Cond. No.                     1.81e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.81e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [ ]:
# Now I want to add hotel fixed effect. 
In [33]:
# Converting 'Year_Week' to datetime (start of week)
panel_data = weekly_summary.copy()
panel_data = panel_data[panel_data['Cumulative_Reviews'] > 0].copy()

panel_data['Year_Week_Date'] = pd.to_datetime(panel_data['Year_Week'] + '-1', format='%Y-%U-%w')

# multi-indexing with proper datetime index for time
panel_data = panel_data.set_index(['Hotel_Name', 'Year_Week_Date'])

# log of cumulative reviews
panel_data['log_Cumulative_Reviews'] = np.log(panel_data['Cumulative_Reviews'])


Y = panel_data['Cumulative_Avg_Score']
X = panel_data[['log_Cumulative_Reviews']]
X = sm.add_constant(X)

#  the fixed effects model (hotel fixed effects)
from linearmodels.panel import PanelOLS
model_fe = PanelOLS(Y, X, entity_effects=True)
results_fe = model_fe.fit()


print(results_fe.summary)
                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.0076
Estimator:                     PanelOLS   R-squared (Between):             -0.0227
No. Observations:                 36134   R-squared (Within):               0.0076
Date:                  Wed, May 07 2025   R-squared (Overall):             -0.0086
Time:                          12:38:16   Log-likelihood                    8266.4
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      274.65
Entities:                           400   P-value                           0.0000
Avg Obs:                         90.335   Distribution:                 F(1,35733)
Min Obs:                        10.0000                                           
Max Obs:                         106.00   F-statistic (robust):             274.65
                                          P-value                           0.0000
Time periods:                       105   Distribution:                 F(1,35733)
Avg Obs:                         344.13                                           
Min Obs:                         299.00                                           
Max Obs:                         496.00                                           
                                                                                  
                                   Parameter Estimates                                    
==========================================================================================
                        Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------
const                      8.3232     0.0053     1564.9     0.0000      8.3128      8.3336
log_Cumulative_Reviews     0.0166     0.0010     16.573     0.0000      0.0146      0.0185
==========================================================================================

F-test for Poolability: 961.48
P-value: 0.0000
Distribution: F(399,35733)

Included effects: Entity
In [ ]:
# adding time fixed effect to hotel fixed effect
In [35]:
model_fe_time = PanelOLS(
    panel_data['Cumulative_Avg_Score'],
    X,
    entity_effects=True,  # hotel fixed effects
    time_effects=True     # time fixed effects (week)
)

results_fe_time = model_fe_time.fit()

print(results_fe_time.summary)
                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.0004
Estimator:                     PanelOLS   R-squared (Between):             -0.0218
No. Observations:                 36134   R-squared (Within):               0.0076
Date:                  Thu, Apr 17 2025   R-squared (Overall):             -0.0081
Time:                          22:25:03   Log-likelihood                    8707.8
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      15.326
Entities:                           400   P-value                           0.0001
Avg Obs:                         90.335   Distribution:                 F(1,35629)
Min Obs:                        10.0000                                           
Max Obs:                         106.00   F-statistic (robust):             15.326
                                          P-value                           0.0001
Time periods:                       105   Distribution:                 F(1,35629)
Avg Obs:                         344.13                                           
Min Obs:                         299.00                                           
Max Obs:                         496.00                                           
                                                                                  
                                   Parameter Estimates                                    
==========================================================================================
                        Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------
const                      8.3274     0.0210     395.76     0.0000      8.2862      8.3687
log_Cumulative_Reviews     0.0158     0.0040     3.9149     0.0001      0.0079      0.0236
==========================================================================================

F-test for Poolability: 781.03
P-value: 0.0000
Distribution: F(503,35629)

Included effects: Entity, Time
In [ ]:
# now the impact of early periods reviews with fixed effect
In [34]:
if 'Final_Cumulative_Reviews' not in panel_data.columns:
    final_reviews = panel_data.groupby('Hotel_Name')['Cumulative_Reviews'].transform('max')
    panel_data['Final_Cumulative_Reviews'] = final_reviews

#  Early_Period and Interaction term
panel_data['Early_Period'] = (panel_data['Cumulative_Reviews'] < 0.25 * panel_data['Final_Cumulative_Reviews']).astype(int)
panel_data['Interaction_Term'] = panel_data['Cumulative_Reviews'] * panel_data['Early_Period']

Y = panel_data['Cumulative_Avg_Score']
X = panel_data[['Cumulative_Reviews', 'Early_Period', 'Interaction_Term']]
X = sm.add_constant(X)

#  the fixed effects regression (hotel and time effects)
model_fe = PanelOLS(Y, X, entity_effects=True, time_effects=True)
results_fe = model_fe.fit()

print(results_fe.summary)
                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.0024
Estimator:                     PanelOLS   R-squared (Between):             -0.0176
No. Observations:                 36134   R-squared (Within):               0.0065
Date:                  Wed, May 07 2025   R-squared (Overall):             -0.0080
Time:                          12:38:28   Log-likelihood                    8743.0
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      28.297
Entities:                           400   P-value                           0.0000
Avg Obs:                         90.335   Distribution:                 F(3,35627)
Min Obs:                        10.0000                                           
Max Obs:                         106.00   F-statistic (robust):             28.297
                                          P-value                           0.0000
Time periods:                       105   Distribution:                 F(3,35627)
Avg Obs:                         344.13                                           
Min Obs:                         299.00                                           
Max Obs:                         496.00                                           
                                                                                  
                                 Parameter Estimates                                  
======================================================================================
                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
--------------------------------------------------------------------------------------
const                  8.3917     0.0027     3110.8     0.0000      8.3864      8.3970
Cumulative_Reviews  4.774e-05  5.898e-06     8.0951     0.0000   3.618e-05    5.93e-05
Early_Period           0.0014     0.0063     0.2217     0.8246     -0.0109      0.0137
Interaction_Term    1.003e-05  2.591e-05     0.3871     0.6987  -4.076e-05   6.082e-05
======================================================================================

F-test for Poolability: 773.88
P-value: 0.0000
Distribution: F(503,35627)

Included effects: Entity, Time
In [ ]:
# 1- Without controlling for hotel or time, we see that a higher number of cumulative reviews is associated with lower average scores, hinting at a decline in ratings over time.
# Once we apply fixed effects, this relationship turns slightly positive, suggesting that when hotel-specific and time-specific factors are held constant, accumulating reviews actually help boost average scores.
# When we introduce early reviews in the pooled OLS model, they show a significant negative impact, indicating that early reviews tend to pull scores down as reviews accumulate.
# However, in the fixed effects model, both the early review term and its interaction lose significance, implying that early dynamics are already explained by differences between hotels or time periods.
# Overall, this suggests that what looks like an early review effect in simple models is actually driven by underlying hotel characteristics or broader time trends.
In [ ]:
# Now I want to add the count or the numbers of positive and negative words to the rgression. because maybe some of the
#reviews have more negative and some other more positive words , so it is not just the nymbers of reviews that can affect
#avg score and it can be influenced by the quality or senstiment of the revies. 
In [35]:
extended_weekly_summary = weekly_summary.copy()


# Extracting needed columns for word counts
word_counts = uk_hotels_df[['Hotel_Name', 'Review_Date', 
                            'Review_Total_Positive_Word_Counts', 
                            'Review_Total_Negative_Word_Counts']].copy()

# Converting 'Review_Date' to datetime and create 'Year_Week' column
word_counts['Review_Date'] = pd.to_datetime(word_counts['Review_Date'])
word_counts['Year_Week'] = word_counts['Review_Date'].dt.strftime('%Y-%U')

#  Grouping by Hotel and Week to sum word counts
weekly_word_counts = word_counts.groupby(['Hotel_Name', 'Year_Week']).agg(
    Total_Positive_Words_This_Week=('Review_Total_Positive_Word_Counts', 'sum'),
    Total_Negative_Words_This_Week=('Review_Total_Negative_Word_Counts', 'sum')
).reset_index()

#  Mergeing with your existing weekly summary
extended_weekly_summary = extended_weekly_summary.merge(
    weekly_word_counts, on=['Hotel_Name', 'Year_Week'], how='left'
)

#  Sorting for cumulative calculations
extended_weekly_summary = extended_weekly_summary.sort_values(['Hotel_Name', 'Year_Week'])

#  Calculating cumulative sums
extended_weekly_summary['Cumulative_Sum_Positive'] = extended_weekly_summary.groupby('Hotel_Name')['Total_Positive_Words_This_Week'].cumsum()
extended_weekly_summary['Cumulative_Sum_Negative'] = extended_weekly_summary.groupby('Hotel_Name')['Total_Negative_Words_This_Week'].cumsum()

#  Calculating cumulative averages (avoid division by zero)
extended_weekly_summary['Cumulative_Avg_Positive'] = extended_weekly_summary['Cumulative_Sum_Positive'] / extended_weekly_summary['Cumulative_Reviews']
extended_weekly_summary['Cumulative_Avg_Negative'] = extended_weekly_summary['Cumulative_Sum_Negative'] / extended_weekly_summary['Cumulative_Reviews']

extended_weekly_summary.head()
Out[35]:
Hotel_Name Year_Week Reviews_This_Week Sum_Scores_This_Week Avg_Score_This_Week Cumulative_Reviews Cumulative_Sum_Scores Cumulative_Avg_Score Total_Positive_Words_This_Week Total_Negative_Words_This_Week Cumulative_Sum_Positive Cumulative_Sum_Negative Cumulative_Avg_Positive Cumulative_Avg_Negative
0 11 Cadogan Gardens 2015-32 1 10.0 10.00 1 10.0 10.00 7 0 7 0 7.000000 0.000000
1 11 Cadogan Gardens 2015-35 1 10.0 10.00 2 20.0 10.00 51 0 58 0 29.000000 0.000000
2 11 Cadogan Gardens 2015-36 2 13.8 6.90 4 33.8 8.45 16 46 74 46 18.500000 11.500000
3 11 Cadogan Gardens 2015-37 1 9.2 9.20 5 43.0 8.60 2 15 76 61 15.200000 12.200000
4 11 Cadogan Gardens 2015-38 2 17.9 8.95 7 60.9 8.70 56 14 132 75 18.857143 10.714286
In [ ]:
# now adding avg cumulative negative and positive words for the regression without fix effect 
In [139]:
#  data for regression (use the already extended_weekly_summary table)
regression_data = extended_weekly_summary.copy()

# Removing rows with zero cumulative reviews to avoid division by zero or log(0) issues
regression_data = regression_data[regression_data['Cumulative_Reviews'] > 0].copy()


Y = regression_data['Cumulative_Avg_Score']

X = regression_data[['Cumulative_Reviews', 'Cumulative_Avg_Positive', 'Cumulative_Avg_Negative']]

X = sm.add_constant(X)

# OLS regression (time variation model, no fixed effects)
model = sm.OLS(Y, X).fit()

print(model.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.585
Model:                              OLS   Adj. R-squared:                  0.585
Method:                   Least Squares   F-statistic:                 1.699e+04
Date:                  Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                          08:54:26   Log-Likelihood:                -20556.
No. Observations:                 36134   AIC:                         4.112e+04
Df Residuals:                     36130   BIC:                         4.115e+04
Df Model:                             3                                         
Covariance Type:              nonrobust                                         
===========================================================================================
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                       8.2021      0.013    613.032      0.000       8.176       8.228
Cumulative_Reviews      -1.734e-05   5.12e-06     -3.386      0.001   -2.74e-05    -7.3e-06
Cumulative_Avg_Positive     0.0759      0.001    125.770      0.000       0.075       0.077
Cumulative_Avg_Negative    -0.0580      0.000   -149.766      0.000      -0.059      -0.057
==============================================================================
Omnibus:                    23711.626   Durbin-Watson:                   0.235
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          8997256.707
Skew:                           1.956   Prob(JB):                         0.00
Kurtosis:                      80.205   Cond. No.                     3.42e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.42e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [ ]:
# Testing for multicollinearity 
In [140]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# independent variables (without the constant for VIF calculation)
X_vif = regression_data[['Cumulative_Reviews', 'Cumulative_Avg_Positive', 'Cumulative_Avg_Negative']]

X_vif_const = sm.add_constant(X_vif)

vif_data = pd.DataFrame()
vif_data['Feature'] = X_vif_const.columns
vif_data['VIF'] = [variance_inflation_factor(X_vif_const.values, i) for i in range(X_vif_const.shape[1])]

print(vif_data)
                   Feature        VIF
0                    const  35.408861
1       Cumulative_Reviews   1.019598
2  Cumulative_Avg_Positive   1.062223
3  Cumulative_Avg_Negative   1.080081
In [ ]:
# the same model with  adding lagged effect, log transformation for all ind variables: this is better one. 
In [141]:
import statsmodels.api as sm

# Copy the data
regression_data = extended_weekly_summary.copy()

# Cleaning data: remove zero or negative values before log transformation
regression_data = regression_data[
    (regression_data['Cumulative_Reviews'] > 0) &
    (regression_data['Cumulative_Avg_Positive'] > 0) &
    (regression_data['Cumulative_Avg_Negative'] > 0)
].copy()

# Sorting for lagging
regression_data = regression_data.sort_values(['Hotel_Name', 'Year_Week'])

# Creating lagged versions of sentiment variables
regression_data['Lagged_Cumulative_Avg_Positive'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Positive'].shift(1)
regression_data['Lagged_Cumulative_Avg_Negative'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Negative'].shift(1)

# Dropping rows with NA after lagging
regression_data = regression_data.dropna(subset=['Lagged_Cumulative_Avg_Positive', 'Lagged_Cumulative_Avg_Negative'])

# Applying log transformation
regression_data['log_Cumulative_Reviews'] = np.log(regression_data['Cumulative_Reviews'])
regression_data['log_Lagged_Cumulative_Avg_Positive'] = np.log(regression_data['Lagged_Cumulative_Avg_Positive'])
regression_data['log_Lagged_Cumulative_Avg_Negative'] = np.log(regression_data['Lagged_Cumulative_Avg_Negative'])

#  Y and X
Y = regression_data['Cumulative_Avg_Score']
X = regression_data[['log_Cumulative_Reviews', 'log_Lagged_Cumulative_Avg_Positive', 'log_Lagged_Cumulative_Avg_Negative']]
X = sm.add_constant(X)


model = sm.OLS(Y, X).fit()


print(model.summary())

#  summary to a text file
with open("regression_summary.txt", "w") as f:
    f.write(model.summary().as_text())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.613
Model:                              OLS   Adj. R-squared:                  0.613
Method:                   Least Squares   F-statistic:                 1.884e+04
Date:                  Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                          08:54:33   Log-Likelihood:                -18525.
No. Observations:                 35654   AIC:                         3.706e+04
Df Residuals:                     35650   BIC:                         3.709e+04
Df Model:                             3                                         
Covariance Type:              nonrobust                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                  7.9963      0.035    229.582      0.000       7.928       8.065
log_Cumulative_Reviews                 0.0048      0.002      2.662      0.008       0.001       0.008
log_Lagged_Cumulative_Avg_Positive     1.2263      0.009    131.016      0.000       1.208       1.245
log_Lagged_Cumulative_Avg_Negative    -1.0648      0.007   -154.329      0.000      -1.078      -1.051
==============================================================================
Omnibus:                     5149.779   Durbin-Watson:                   0.246
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            51226.815
Skew:                          -0.371   Prob(JB):                         0.00
Kurtosis:                       8.825   Cond. No.                         113.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [34]:
# same models with just log for cummulative reviews.
In [37]:
regression_data = extended_weekly_summary.copy()

regression_data = regression_data[regression_data['Cumulative_Reviews'] > 0].copy()

regression_data = regression_data.sort_values(['Hotel_Name', 'Year_Week'])

regression_data['Lagged_Cumulative_Avg_Positive'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Positive'].shift(1)
regression_data['Lagged_Cumulative_Avg_Negative'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Negative'].shift(1)

regression_data = regression_data.dropna(subset=['Lagged_Cumulative_Avg_Positive', 'Lagged_Cumulative_Avg_Negative'])

# Applying log transformation only to Cumulative_Reviews
regression_data['log_Cumulative_Reviews'] = np.log(regression_data['Cumulative_Reviews'])

Y = regression_data['Cumulative_Avg_Score']
X = regression_data[['log_Cumulative_Reviews', 'Lagged_Cumulative_Avg_Positive', 'Lagged_Cumulative_Avg_Negative']]

X = sm.add_constant(X)

model = sm.OLS(Y, X).fit()

print(model.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.565
Model:                              OLS   Adj. R-squared:                  0.565
Method:                   Least Squares   F-statistic:                 1.550e+04
Date:                  Wed, 07 May 2025   Prob (F-statistic):               0.00
Time:                          12:39:00   Log-Likelihood:                -20711.
No. Observations:                 35734   AIC:                         4.143e+04
Df Residuals:                     35730   BIC:                         4.146e+04
Df Model:                             3                                         
Covariance Type:              nonrobust                                         
==================================================================================================
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const                              8.2582      0.016    520.136      0.000       8.227       8.289
log_Cumulative_Reviews            -0.0168      0.002     -9.202      0.000      -0.020      -0.013
Lagged_Cumulative_Avg_Positive     0.0746      0.001    121.670      0.000       0.073       0.076
Lagged_Cumulative_Avg_Negative    -0.0553      0.000   -140.219      0.000      -0.056      -0.055
==============================================================================
Omnibus:                    24256.846   Durbin-Watson:                   0.281
Prob(Omnibus):                  0.000   Jarque-Bera (JB):         10543634.815
Skew:                           2.049   Prob(JB):                         0.00
Kurtosis:                      87.051   Cond. No.                         173.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# Now Early period variable and its interaction added to regression withou log transformation for ave cumlative neg and positve 
In [36]:
regression_data = extended_weekly_summary.copy()

regression_data = regression_data[regression_data['Cumulative_Reviews'] > 0].copy()

regression_data = regression_data.sort_values(['Hotel_Name', 'Year_Week'])

regression_data['Lagged_Cumulative_Avg_Positive'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Positive'].shift(1)
regression_data['Lagged_Cumulative_Avg_Negative'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Negative'].shift(1)

#  Defining Early_Period dummy variable
#  calculating final cumulative reviews per hotel
regression_data['Final_Cumulative_Reviews'] = regression_data.groupby('Hotel_Name')['Cumulative_Reviews'].transform('max')
regression_data['Early_Period'] = (regression_data['Cumulative_Reviews'] < 0.25 * regression_data['Final_Cumulative_Reviews']).astype(int)

#  Droping rows with NA after lagging
regression_data = regression_data.dropna(subset=['Lagged_Cumulative_Avg_Positive', 'Lagged_Cumulative_Avg_Negative'])

regression_data['log_Cumulative_Reviews'] = np.log(regression_data['Cumulative_Reviews'])

#  Creating interaction term
regression_data['Interaction_Term'] = regression_data['log_Cumulative_Reviews'] * regression_data['Early_Period']

#  Defining dependent and independent variables
Y = regression_data['Cumulative_Avg_Score']
X = regression_data[['log_Cumulative_Reviews', 
                     'Lagged_Cumulative_Avg_Positive', 
                     'Lagged_Cumulative_Avg_Negative', 
                     'Early_Period', 
                     'Interaction_Term']]

X = sm.add_constant(X)

model = sm.OLS(Y, X).fit()

print(model.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.570
Model:                              OLS   Adj. R-squared:                  0.570
Method:                   Least Squares   F-statistic:                     9458.
Date:                  Mon, 07 Apr 2025   Prob (F-statistic):               0.00
Time:                          13:27:40   Log-Likelihood:                -20537.
No. Observations:                 35734   AIC:                         4.109e+04
Df Residuals:                     35728   BIC:                         4.114e+04
Df Model:                             5                                         
Covariance Type:              nonrobust                                         
==================================================================================================
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const                              8.4494      0.021    409.139      0.000       8.409       8.490
log_Cumulative_Reviews            -0.0438      0.003    -15.684      0.000      -0.049      -0.038
Lagged_Cumulative_Avg_Positive     0.0733      0.001    119.331      0.000       0.072       0.074
Lagged_Cumulative_Avg_Negative    -0.0550      0.000   -140.066      0.000      -0.056      -0.054
Early_Period                      -0.1352      0.023     -5.921      0.000      -0.180      -0.090
Interaction_Term                   0.0026      0.005      0.534      0.593      -0.007       0.012
==============================================================================
Omnibus:                    24219.792   Durbin-Watson:                   0.280
Prob(Omnibus):                  0.000   Jarque-Bera (JB):         10362800.372
Skew:                           2.047   Prob(JB):                         0.00
Kurtosis:                      86.326   Cond. No.                         301.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# now the same with log for all ind vraiables. 
In [153]:
regression_data = extended_weekly_summary.copy()

#  removing zero or negative values (log safety)
regression_data = regression_data[
    (regression_data['Cumulative_Reviews'] > 0) &
    (regression_data['Cumulative_Avg_Positive'] > 0) &
    (regression_data['Cumulative_Avg_Negative'] > 0)
].copy()

# Sorting for lagging
regression_data = regression_data.sort_values(['Hotel_Name', 'Year_Week'])

#  lagged versions of sentiment variables
regression_data['Lagged_Cumulative_Avg_Positive'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Positive'].shift(1)
regression_data['Lagged_Cumulative_Avg_Negative'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Negative'].shift(1)

# Droping rows with NA after lagging
regression_data = regression_data.dropna(subset=['Lagged_Cumulative_Avg_Positive', 'Lagged_Cumulative_Avg_Negative'])

#  Log transform all continuous predictors
regression_data['log_Cumulative_Reviews'] = np.log(regression_data['Cumulative_Reviews'])
regression_data['log_Lagged_Cumulative_Avg_Positive'] = np.log(regression_data['Lagged_Cumulative_Avg_Positive'])
regression_data['log_Lagged_Cumulative_Avg_Negative'] = np.log(regression_data['Lagged_Cumulative_Avg_Negative'])

# Defining Early_Period dummy
regression_data['Final_Cumulative_Reviews'] = regression_data.groupby('Hotel_Name')['Cumulative_Reviews'].transform('max')
regression_data['Early_Period'] = (regression_data['Cumulative_Reviews'] < 0.25 * regression_data['Final_Cumulative_Reviews']).astype(int)

#  interaction term
regression_data['Interaction_Term'] = regression_data['log_Cumulative_Reviews'] * regression_data['Early_Period']


Y = regression_data['Cumulative_Avg_Score']
X = regression_data[['log_Cumulative_Reviews',
                     'log_Lagged_Cumulative_Avg_Positive',
                     'log_Lagged_Cumulative_Avg_Negative',
                     'Early_Period',
                     'Interaction_Term']]


X = sm.add_constant(X)

model = sm.OLS(Y, X).fit()

print(model.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.617
Model:                              OLS   Adj. R-squared:                  0.617
Method:                   Least Squares   F-statistic:                 1.147e+04
Date:                  Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                          11:18:47   Log-Likelihood:                -18367.
No. Observations:                 35654   AIC:                         3.675e+04
Df Residuals:                     35648   BIC:                         3.680e+04
Df Model:                             5                                         
Covariance Type:              nonrobust                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                  8.2455      0.038    216.756      0.000       8.171       8.320
log_Cumulative_Reviews                -0.0258      0.003     -9.710      0.000      -0.031      -0.021
log_Lagged_Cumulative_Avg_Positive     1.2037      0.009    127.926      0.000       1.185       1.222
log_Lagged_Cumulative_Avg_Negative    -1.0628      0.007   -154.368      0.000      -1.076      -1.049
Early_Period                          -0.2287      0.022    -10.421      0.000      -0.272      -0.186
Interaction_Term                       0.0273      0.005      5.894      0.000       0.018       0.036
==============================================================================
Omnibus:                     4938.203   Durbin-Watson:                   0.245
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            49187.911
Skew:                          -0.330   Prob(JB):                         0.00
Kurtosis:                       8.716   Cond. No.                         127.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [41]:
regression_data = extended_weekly_summary.copy()

# Remove zero or negative values (log safety)
regression_data = regression_data[
    (regression_data['Cumulative_Reviews'] > 0) &
    (regression_data['Cumulative_Avg_Positive'] > 0) &
    (regression_data['Cumulative_Avg_Negative'] > 0)
].copy()

# Sort for lagging
regression_data = regression_data.sort_values(['Hotel_Name', 'Year_Week'])

# Create lagged versions of sentiment variables
regression_data['Lagged_Cumulative_Avg_Positive'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Positive'].shift(1)
regression_data['Lagged_Cumulative_Avg_Negative'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Negative'].shift(1)

# Drop rows with NA after lagging
regression_data = regression_data.dropna(subset=['Lagged_Cumulative_Avg_Positive', 'Lagged_Cumulative_Avg_Negative'])

# Log transform all continuous predictors
regression_data['log_Cumulative_Reviews'] = np.log(regression_data['Cumulative_Reviews'])
regression_data['log_Lagged_Cumulative_Avg_Positive'] = np.log(regression_data['Lagged_Cumulative_Avg_Positive'])
regression_data['log_Lagged_Cumulative_Avg_Negative'] = np.log(regression_data['Lagged_Cumulative_Avg_Negative'])

# Define Final Cumulative Reviews and Late Period Dummy
regression_data['Final_Cumulative_Reviews'] = regression_data.groupby('Hotel_Name')['Cumulative_Reviews'].transform('max')
regression_data['Late_Period'] = (regression_data['Cumulative_Reviews'] > 0.75 * regression_data['Final_Cumulative_Reviews']).astype(int)

# Interaction term: Late_Period × log(Cumulative Reviews)
regression_data['Interaction_Term_Late'] = regression_data['log_Cumulative_Reviews'] * regression_data['Late_Period']

# Define response and predictors
Y = regression_data['Cumulative_Avg_Score']
X = regression_data[['log_Cumulative_Reviews',
                     'log_Lagged_Cumulative_Avg_Positive',
                     'log_Lagged_Cumulative_Avg_Negative',
                     'Late_Period',
                     'Interaction_Term_Late']]

X = sm.add_constant(X)

# Run the model
model = sm.OLS(Y, X).fit()

# Show the summary
print(model.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.615
Model:                              OLS   Adj. R-squared:                  0.615
Method:                   Least Squares   F-statistic:                 1.138e+04
Date:                  Wed, 07 May 2025   Prob (F-statistic):               0.00
Time:                          12:39:36   Log-Likelihood:                -18452.
No. Observations:                 35654   AIC:                         3.692e+04
Df Residuals:                     35648   BIC:                         3.697e+04
Df Model:                             5                                         
Covariance Type:              nonrobust                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                  8.0156      0.035    228.913      0.000       7.947       8.084
log_Cumulative_Reviews                 0.0049      0.002      2.302      0.021       0.001       0.009
log_Lagged_Cumulative_Avg_Positive     1.2157      0.009    129.546      0.000       1.197       1.234
log_Lagged_Cumulative_Avg_Negative    -1.0650      0.007   -154.615      0.000      -1.079      -1.052
Late_Period                            0.3096      0.032      9.724      0.000       0.247       0.372
Interaction_Term_Late                 -0.0442      0.005     -8.389      0.000      -0.054      -0.034
==============================================================================
Omnibus:                     5106.448   Durbin-Watson:                   0.245
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            50823.520
Skew:                          -0.363   Prob(JB):                         0.00
Kurtosis:                       8.804   Cond. No.                         118.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# time fix effect model with log for all. 
In [145]:
panel_data = extended_weekly_summary.copy()

panel_data = panel_data[
    (panel_data['Cumulative_Reviews'] > 0) &
    (panel_data['Cumulative_Avg_Positive'] > 0) &
    (panel_data['Cumulative_Avg_Negative'] > 0)
].copy()

panel_data = panel_data.sort_values(['Hotel_Name', 'Year_Week'])

panel_data['Lagged_Cumulative_Avg_Positive'] = panel_data.groupby('Hotel_Name')['Cumulative_Avg_Positive'].shift(1)
panel_data['Lagged_Cumulative_Avg_Negative'] = panel_data.groupby('Hotel_Name')['Cumulative_Avg_Negative'].shift(1)

panel_data = panel_data.dropna(subset=['Lagged_Cumulative_Avg_Positive', 'Lagged_Cumulative_Avg_Negative'])

panel_data['log_Cumulative_Reviews'] = np.log(panel_data['Cumulative_Reviews'])
panel_data['log_Lagged_Cumulative_Avg_Positive'] = np.log(panel_data['Lagged_Cumulative_Avg_Positive'])
panel_data['log_Lagged_Cumulative_Avg_Negative'] = np.log(panel_data['Lagged_Cumulative_Avg_Negative'])

panel_data['Year_Week_Date'] = pd.to_datetime(panel_data['Year_Week'] + '-1', format='%Y-%U-%w')

#  Setting panel data index (entity = Hotel_Name, time = Year_Week_Date)
panel_data = panel_data.set_index(['Hotel_Name', 'Year_Week_Date'])

Y = panel_data['Cumulative_Avg_Score']
X = panel_data[['log_Cumulative_Reviews', 
                'log_Lagged_Cumulative_Avg_Positive', 
                'log_Lagged_Cumulative_Avg_Negative']]

X = sm.add_constant(X)

model_fe = PanelOLS(Y, X, entity_effects=True, time_effects=True)
results_fe = model_fe.fit()

print(results_fe.summary)
                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.1964
Estimator:                     PanelOLS   R-squared (Between):              0.2751
No. Observations:                 35654   R-squared (Within):               0.1917
Date:                  Thu, May 08 2025   R-squared (Overall):              0.2908
Time:                          10:04:25   Log-likelihood                 1.913e+04
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      2862.7
Entities:                           400   P-value                           0.0000
Avg Obs:                         89.135   Distribution:                 F(3,35148)
Min Obs:                         7.0000                                           
Max Obs:                         105.00   F-statistic (robust):             2862.7
                                          P-value                           0.0000
Time periods:                       104   Distribution:                 F(3,35148)
Avg Obs:                         342.83                                           
Min Obs:                         252.00                                           
Max Obs:                         495.00                                           
                                                                                  
                                         Parameter Estimates                                          
======================================================================================================
                                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------------------
const                                  7.8796     0.0260     303.64     0.0000      7.8287      7.9304
log_Cumulative_Reviews                 0.0492     0.0035     14.141     0.0000      0.0424      0.0560
log_Lagged_Cumulative_Avg_Positive     0.4034     0.0062     64.704     0.0000      0.3912      0.4156
log_Lagged_Cumulative_Avg_Negative    -0.2999     0.0044    -68.943     0.0000     -0.3085     -0.2914
======================================================================================================

F-test for Poolability: 508.80
P-value: 0.0000
Distribution: F(502,35148)

Included effects: Entity, Time
In [ ]:
# same with early period . 
In [149]:
panel_data = extended_weekly_summary.copy()

# Clean data: remove zero or negative values (log safety)
panel_data = panel_data[
    (panel_data['Cumulative_Reviews'] > 0) &
    (panel_data['Cumulative_Avg_Positive'] > 0) &
    (panel_data['Cumulative_Avg_Negative'] > 0)
].copy()

panel_data = panel_data.sort_values(['Hotel_Name', 'Year_Week'])

panel_data['Lagged_Cumulative_Avg_Positive'] = panel_data.groupby('Hotel_Name')['Cumulative_Avg_Positive'].shift(1)
panel_data['Lagged_Cumulative_Avg_Negative'] = panel_data.groupby('Hotel_Name')['Cumulative_Avg_Negative'].shift(1)

panel_data = panel_data.dropna(subset=['Lagged_Cumulative_Avg_Positive', 'Lagged_Cumulative_Avg_Negative'])

panel_data['log_Cumulative_Reviews'] = np.log(panel_data['Cumulative_Reviews'])
panel_data['log_Lagged_Cumulative_Avg_Positive'] = np.log(panel_data['Lagged_Cumulative_Avg_Positive'])
panel_data['log_Lagged_Cumulative_Avg_Negative'] = np.log(panel_data['Lagged_Cumulative_Avg_Negative'])

panel_data['Final_Cumulative_Reviews'] = panel_data.groupby('Hotel_Name')['Cumulative_Reviews'].transform('max')
panel_data['Early_Period'] = (panel_data['Cumulative_Reviews'] < 0.25 * panel_data['Final_Cumulative_Reviews']).astype(int)

panel_data['Year_Week_Date'] = pd.to_datetime(panel_data['Year_Week'] + '-1', format='%Y-%U-%w')

#  panel index
panel_data = panel_data.set_index(['Hotel_Name', 'Year_Week_Date'])

# Define dependent and independent variables
Y = panel_data['Cumulative_Avg_Score']
X = panel_data[['log_Cumulative_Reviews',
                'log_Lagged_Cumulative_Avg_Positive',
                'log_Lagged_Cumulative_Avg_Negative',
                'Early_Period']]

X = sm.add_constant(X)

model_fe = PanelOLS(Y, X, entity_effects=True, time_effects=True)
results_fe = model_fe.fit()

print(results_fe.summary)
                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.1967
Estimator:                     PanelOLS   R-squared (Between):              0.2696
No. Observations:                 35654   R-squared (Within):               0.1891
Date:                  Thu, May 08 2025   R-squared (Overall):              0.2872
Time:                          11:11:01   Log-likelihood                 1.914e+04
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      2151.7
Entities:                           400   P-value                           0.0000
Avg Obs:                         89.135   Distribution:                 F(4,35147)
Min Obs:                         7.0000                                           
Max Obs:                         105.00   F-statistic (robust):             2151.7
                                          P-value                           0.0000
Time periods:                       104   Distribution:                 F(4,35147)
Avg Obs:                         342.83                                           
Min Obs:                         252.00                                           
Max Obs:                         495.00                                           
                                                                                  
                                         Parameter Estimates                                          
======================================================================================================
                                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------------------
const                                  7.8487     0.0271     289.20     0.0000      7.7955      7.9019
log_Cumulative_Reviews                 0.0547     0.0038     14.565     0.0000      0.0473      0.0620
log_Lagged_Cumulative_Avg_Positive     0.4028     0.0062     64.607     0.0000      0.3906      0.4150
log_Lagged_Cumulative_Avg_Negative    -0.3002     0.0044    -69.003     0.0000     -0.3087     -0.2916
Early_Period                           0.0165     0.0043     3.8793     0.0001      0.0082      0.0249
======================================================================================================

F-test for Poolability: 504.51
P-value: 0.0000
Distribution: F(502,35147)

Included effects: Entity, Time
In [150]:
panel_data = extended_weekly_summary.copy()

)
panel_data = panel_data[
    (panel_data['Cumulative_Reviews'] > 0) &
    (panel_data['Cumulative_Avg_Positive'] > 0) &
    (panel_data['Cumulative_Avg_Negative'] > 0)
].copy()


panel_data = panel_data.sort_values(['Hotel_Name', 'Year_Week'])


panel_data['Lagged_Cumulative_Avg_Positive'] = panel_data.groupby('Hotel_Name')['Cumulative_Avg_Positive'].shift(1)
panel_data['Lagged_Cumulative_Avg_Negative'] = panel_data.groupby('Hotel_Name')['Cumulative_Avg_Negative'].shift(1)


panel_data = panel_data.dropna(subset=['Lagged_Cumulative_Avg_Positive', 'Lagged_Cumulative_Avg_Negative'])


panel_data['log_Cumulative_Reviews'] = np.log(panel_data['Cumulative_Reviews'])
panel_data['log_Lagged_Cumulative_Avg_Positive'] = np.log(panel_data['Lagged_Cumulative_Avg_Positive'])
panel_data['log_Lagged_Cumulative_Avg_Negative'] = np.log(panel_data['Lagged_Cumulative_Avg_Negative'])


panel_data['Final_Cumulative_Reviews'] = panel_data.groupby('Hotel_Name')['Cumulative_Reviews'].transform('max')
panel_data['Early_Period'] = (panel_data['Cumulative_Reviews'] < 0.25 * panel_data['Final_Cumulative_Reviews']).astype(int)

panel_data['Year_Week_Date'] = pd.to_datetime(panel_data['Year_Week'] + '-1', format='%Y-%U-%w')


panel_data['Interaction_Term'] = panel_data['log_Cumulative_Reviews'] * panel_data['Early_Period']

panel_data = panel_data.set_index(['Hotel_Name', 'Year_Week_Date'])


Y = panel_data['Cumulative_Avg_Score']
X = panel_data[['log_Cumulative_Reviews',
                'log_Lagged_Cumulative_Avg_Positive',
                'log_Lagged_Cumulative_Avg_Negative',
                'Early_Period',
                'Interaction_Term']]


X = sm.add_constant(X)


model_fe = PanelOLS(Y, X, entity_effects=True, time_effects=True)
results_fe = model_fe.fit()


print(results_fe.summary)
                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.1974
Estimator:                     PanelOLS   R-squared (Between):              0.2620
No. Observations:                 35654   R-squared (Within):               0.1836
Date:                  Thu, May 08 2025   R-squared (Overall):              0.2819
Time:                          11:11:10   Log-likelihood                 1.915e+04
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      1729.3
Entities:                           400   P-value                           0.0000
Avg Obs:                         89.135   Distribution:                 F(5,35146)
Min Obs:                         7.0000                                           
Max Obs:                         105.00   F-statistic (robust):             1729.3
                                          P-value                           0.0000
Time periods:                       104   Distribution:                 F(5,35146)
Avg Obs:                         342.83                                           
Min Obs:                         252.00                                           
Max Obs:                         495.00                                           
                                                                                  
                                         Parameter Estimates                                          
======================================================================================================
                                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------------------
const                                  7.7851     0.0294     265.18     0.0000      7.7275      7.8426
log_Cumulative_Reviews                 0.0635     0.0041     15.630     0.0000      0.0556      0.0715
log_Lagged_Cumulative_Avg_Positive     0.4044     0.0062     64.823     0.0000      0.3921      0.4166
log_Lagged_Cumulative_Avg_Negative    -0.2966     0.0044    -67.512     0.0000     -0.3052     -0.2880
Early_Period                           0.0721     0.0107     6.7456     0.0000      0.0511      0.0930
Interaction_Term                      -0.0115     0.0020    -5.6675     0.0000     -0.0155     -0.0075
======================================================================================================

F-test for Poolability: 504.46
P-value: 0.0000
Distribution: F(502,35146)

Included effects: Entity, Time
In [165]:
panel_data = extended_weekly_summary.copy()

panel_data = panel_data[
    (panel_data['Cumulative_Reviews'] > 0) &
    (panel_data['Cumulative_Avg_Positive'] > 0) &
    (panel_data['Cumulative_Avg_Negative'] > 0)
].copy()

panel_data = panel_data.sort_values(['Hotel_Name', 'Year_Week'])

panel_data['Lagged_Cumulative_Avg_Positive'] = panel_data.groupby('Hotel_Name')['Cumulative_Avg_Positive'].shift(1)
panel_data['Lagged_Cumulative_Avg_Negative'] = panel_data.groupby('Hotel_Name')['Cumulative_Avg_Negative'].shift(1)

panel_data = panel_data.dropna(subset=['Lagged_Cumulative_Avg_Positive', 'Lagged_Cumulative_Avg_Negative'])

panel_data['log_Cumulative_Reviews'] = np.log(panel_data['Cumulative_Reviews'])
panel_data['log_Lagged_Cumulative_Avg_Positive'] = np.log(panel_data['Lagged_Cumulative_Avg_Positive'])
panel_data['log_Lagged_Cumulative_Avg_Negative'] = np.log(panel_data['Lagged_Cumulative_Avg_Negative'])

panel_data['Final_Cumulative_Reviews'] = panel_data.groupby('Hotel_Name')['Cumulative_Reviews'].transform('max')
panel_data['Late_Period'] = (panel_data['Cumulative_Reviews'] > 0.75 * panel_data['Final_Cumulative_Reviews']).astype(int)

panel_data['Year_Week_Date'] = pd.to_datetime(panel_data['Year_Week'] + '-1', format='%Y-%U-%w')


panel_data['Interaction_Term_Late'] = panel_data['log_Cumulative_Reviews'] * panel_data['Late_Period']

panel_data = panel_data.set_index(['Hotel_Name', 'Year_Week_Date'])

Y = panel_data['Cumulative_Avg_Score']
X = panel_data[['log_Cumulative_Reviews',
                'log_Lagged_Cumulative_Avg_Positive',
                'log_Lagged_Cumulative_Avg_Negative',
                'Late_Period',
                'Interaction_Term_Late']]

X = sm.add_constant(X)

model_fe = PanelOLS(Y, X, entity_effects=True, time_effects=True)
results_fe = model_fe.fit()

print(results_fe.summary)
                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.1964
Estimator:                     PanelOLS   R-squared (Between):              0.2744
No. Observations:                 35654   R-squared (Within):               0.1879
Date:                  Thu, May 08 2025   R-squared (Overall):              0.2902
Time:                          11:38:15   Log-likelihood                 1.913e+04
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      1717.9
Entities:                           400   P-value                           0.0000
Avg Obs:                         89.135   Distribution:                 F(5,35146)
Min Obs:                         7.0000                                           
Max Obs:                         105.00   F-statistic (robust):             1717.9
                                          P-value                           0.0000
Time periods:                       104   Distribution:                 F(5,35146)
Avg Obs:                         342.83                                           
Min Obs:                         252.00                                           
Max Obs:                         495.00                                           
                                                                                  
                                         Parameter Estimates                                          
======================================================================================================
                                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------------------
const                                  7.8760     0.0261     301.71     0.0000      7.8248      7.9272
log_Cumulative_Reviews                 0.0496     0.0035     14.196     0.0000      0.0427      0.0564
log_Lagged_Cumulative_Avg_Positive     0.4034     0.0062     64.692     0.0000      0.3911      0.4156
log_Lagged_Cumulative_Avg_Negative    -0.2998     0.0044    -68.759     0.0000     -0.3084     -0.2913
Late_Period                           -0.0030     0.0127    -0.2396     0.8107     -0.0279      0.0218
Interaction_Term_Late                  0.0013     0.0020     0.6474     0.5174     -0.0026      0.0052
======================================================================================================

F-test for Poolability: 506.44
P-value: 0.0000
Distribution: F(502,35146)

Included effects: Entity, Time
In [ ]:
# 2- The fixed effects model reveals a stronger positive impact of cumulative reviews on average score compared to pooled OLS, especially when early reviews are included (+0.0547 vs. +0.0048).
# Early reviews show a contrasting effect: negative in pooled OLS (suggesting early adopters are critical across hotels), but positive in fixed effects (indicating within-hotel early users are more favorable).
# Positive words significantly boost the score, while negative words lower it sharply in both models, confirming that review sentiment is a key driver beyond review count.
# With fixed effects, the influence of sentiment is clearer, showing cumulative positivity reinforces scores over time, while negative sentiment consistently pulls scores down.
# Overall, fixed effects capture within-hotel dynamics better, revealing how both the quantity and quality (positive/negative words) of reviews shape perceived hotel quality over time.
In [ ]:
# In this step I want to add another factor might affect the corrolation between score review and the numbers of review:
#days_since_review
#Total_Number_of_Reviews_Reviewer_Has_Given
In [166]:
review_data = uk_hotels_df.copy()

review_data['Review_Date'] = pd.to_datetime(review_data['Review_Date'])

# For days_since_review, using the max review date in the dataset as reference point
reference_date = review_data['Review_Date'].max()
review_data['days_since_review'] = (reference_date - review_data['Review_Date']).dt.days

#  Preparing Year_Week to match extended_weekly_summary
review_data['Year_Week'] = review_data['Review_Date'].dt.strftime('%Y-%U')

#  Grouping by Hotel_Name and Year_Week to compute weekly aggregates
aggregated_review_data = review_data.groupby(['Hotel_Name', 'Year_Week']).agg(
    Avg_Days_Since_Review=('days_since_review', 'mean'),
    Avg_Reviewer_Experience=('Total_Number_of_Reviews_Reviewer_Has_Given', 'mean')
).reset_index()

#  Merging these aggregates to the existing extended_weekly_summary
#   here we used aggregated ones for regression
enhanced_weekly_summary = extended_weekly_summary.merge(
    aggregated_review_data,
    on=['Hotel_Name', 'Year_Week'],
    how='left'
)

enhanced_weekly_summary.head()
Out[166]:
Hotel_Name Year_Week Reviews_This_Week Sum_Scores_This_Week Avg_Score_This_Week Cumulative_Reviews Cumulative_Sum_Scores Cumulative_Avg_Score Total_Positive_Words_This_Week Total_Negative_Words_This_Week Cumulative_Sum_Positive Cumulative_Sum_Negative Cumulative_Avg_Positive Cumulative_Avg_Negative Avg_Days_Since_Review Avg_Reviewer_Experience
0 11 Cadogan Gardens 2015-32 1 10.0 10.00 1 10.0 10.00 7 0 7 0 7.000000 0.000000 721.0 111.0
1 11 Cadogan Gardens 2015-35 1 10.0 10.00 2 20.0 10.00 51 0 58 0 29.000000 0.000000 704.0 4.0
2 11 Cadogan Gardens 2015-36 2 13.8 6.90 4 33.8 8.45 16 46 74 46 18.500000 11.500000 693.0 8.5
3 11 Cadogan Gardens 2015-37 1 9.2 9.20 5 43.0 8.60 2 15 76 61 15.200000 12.200000 688.0 8.0
4 11 Cadogan Gardens 2015-38 2 17.9 8.95 7 60.9 8.70 56 14 132 75 18.857143 10.714286 679.0 2.5
In [ ]:
# adding Avg_review_exprience(withou log) to time variation model (without fix effect).
In [167]:
regression_data = enhanced_weekly_summary.copy()

#  Cleaning data - remove zeros or negatives before log transformation
regression_data = regression_data[
    (regression_data['Cumulative_Reviews'] > 0) &
    (regression_data['Cumulative_Avg_Positive'] > 0) &
    (regression_data['Cumulative_Avg_Negative'] > 0)
].copy()

#  Sorting data for lagging
regression_data = regression_data.sort_values(['Hotel_Name', 'Year_Week'])

# Creating lagged sentiment variables
regression_data['Lagged_Cumulative_Avg_Positive'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Positive'].shift(1)
regression_data['Lagged_Cumulative_Avg_Negative'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Negative'].shift(1)

#  Droping rows with missing lagged values
regression_data = regression_data.dropna(subset=['Lagged_Cumulative_Avg_Positive', 'Lagged_Cumulative_Avg_Negative'])

#  Log-transform selected variables
regression_data['log_Cumulative_Reviews'] = np.log(regression_data['Cumulative_Reviews'])
regression_data['log_Lagged_Cumulative_Avg_Positive'] = np.log(regression_data['Lagged_Cumulative_Avg_Positive'])
regression_data['log_Lagged_Cumulative_Avg_Negative'] = np.log(regression_data['Lagged_Cumulative_Avg_Negative'])

Y = regression_data['Cumulative_Avg_Score']
X = regression_data[['log_Cumulative_Reviews',
                     'log_Lagged_Cumulative_Avg_Positive',
                     'log_Lagged_Cumulative_Avg_Negative',
                     'Avg_Reviewer_Experience']]
X = sm.add_constant(X)

model = sm.OLS(Y, X).fit()

print(model.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.613
Model:                              OLS   Adj. R-squared:                  0.613
Method:                   Least Squares   F-statistic:                 1.413e+04
Date:                  Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                          11:38:25   Log-Likelihood:                -18524.
No. Observations:                 35654   AIC:                         3.706e+04
Df Residuals:                     35649   BIC:                         3.710e+04
Df Model:                             4                                         
Covariance Type:              nonrobust                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                  7.9939      0.035    228.864      0.000       7.925       8.062
log_Cumulative_Reviews                 0.0049      0.002      2.713      0.007       0.001       0.008
log_Lagged_Cumulative_Avg_Positive     1.2260      0.009    130.861      0.000       1.208       1.244
log_Lagged_Cumulative_Avg_Negative    -1.0645      0.007   -154.155      0.000      -1.078      -1.051
Avg_Reviewer_Experience                0.0003      0.000      0.920      0.357      -0.000       0.001
==============================================================================
Omnibus:                     5142.331   Durbin-Watson:                   0.246
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            51114.002
Skew:                          -0.370   Prob(JB):                         0.00
Kurtosis:                       8.819   Cond. No.                         176.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# with log 
In [168]:
regression_data = enhanced_weekly_summary.copy()

regression_data = regression_data[
    (regression_data['Cumulative_Reviews'] > 0) &
    (regression_data['Cumulative_Avg_Positive'] > 0) &
    (regression_data['Cumulative_Avg_Negative'] > 0) &
    (regression_data['Avg_Reviewer_Experience'] > 0)  # Important for log
].copy()


regression_data = regression_data.sort_values(['Hotel_Name', 'Year_Week'])

regression_data['Lagged_Cumulative_Avg_Positive'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Positive'].shift(1)
regression_data['Lagged_Cumulative_Avg_Negative'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Negative'].shift(1)

regression_data = regression_data.dropna(subset=['Lagged_Cumulative_Avg_Positive', 'Lagged_Cumulative_Avg_Negative'])

#  Log-transform continuous predictors
regression_data['log_Cumulative_Reviews'] = np.log(regression_data['Cumulative_Reviews'])
regression_data['log_Lagged_Cumulative_Avg_Positive'] = np.log(regression_data['Lagged_Cumulative_Avg_Positive'])
regression_data['log_Lagged_Cumulative_Avg_Negative'] = np.log(regression_data['Lagged_Cumulative_Avg_Negative'])
regression_data['log_Avg_Reviewer_Experience'] = np.log(regression_data['Avg_Reviewer_Experience'])

Y = regression_data['Cumulative_Avg_Score']
X = regression_data[['log_Cumulative_Reviews',
                     'log_Lagged_Cumulative_Avg_Positive',
                     'log_Lagged_Cumulative_Avg_Negative',
                     'log_Avg_Reviewer_Experience']]

X = sm.add_constant(X)

model = sm.OLS(Y, X).fit()

print(model.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.613
Model:                              OLS   Adj. R-squared:                  0.613
Method:                   Least Squares   F-statistic:                 1.413e+04
Date:                  Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                          11:38:28   Log-Likelihood:                -18524.
No. Observations:                 35654   AIC:                         3.706e+04
Df Residuals:                     35649   BIC:                         3.710e+04
Df Model:                             4                                         
Covariance Type:              nonrobust                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                  8.0013      0.035    227.971      0.000       7.933       8.070
log_Cumulative_Reviews                 0.0049      0.002      2.714      0.007       0.001       0.008
log_Lagged_Cumulative_Avg_Positive     1.2266      0.009    131.007      0.000       1.208       1.245
log_Lagged_Cumulative_Avg_Negative    -1.0652      0.007   -154.230      0.000      -1.079      -1.052
log_Avg_Reviewer_Experience           -0.0033      0.003     -1.158      0.247      -0.009       0.002
==============================================================================
Omnibus:                     5161.456   Durbin-Watson:                   0.246
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            51408.294
Skew:                          -0.373   Prob(JB):                         0.00
Kurtosis:                       8.835   Cond. No.                         117.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# now adding Avg_Days_Since_Review
In [163]:
regression_data = enhanced_weekly_summary.copy()

regression_data = regression_data[
    (regression_data['Cumulative_Reviews'] > 0) &
    (regression_data['Cumulative_Avg_Positive'] > 0) &
    (regression_data['Cumulative_Avg_Negative'] > 0) &
    (regression_data['Avg_Reviewer_Experience'] > 0) &
    (regression_data['Avg_Days_Since_Review'] > 0)  # Important for log
].copy()

# Sorting data for lagging
regression_data = regression_data.sort_values(['Hotel_Name', 'Year_Week'])

#  Creating lagged sentiment variables
regression_data['Lagged_Cumulative_Avg_Positive'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Positive'].shift(1)
regression_data['Lagged_Cumulative_Avg_Negative'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Negative'].shift(1)

# Droping rows with missing lagged values
regression_data = regression_data.dropna(subset=['Lagged_Cumulative_Avg_Positive', 'Lagged_Cumulative_Avg_Negative'])


regression_data['log_Cumulative_Reviews'] = np.log(regression_data['Cumulative_Reviews'])
regression_data['log_Lagged_Cumulative_Avg_Positive'] = np.log(regression_data['Lagged_Cumulative_Avg_Positive'])
regression_data['log_Lagged_Cumulative_Avg_Negative'] = np.log(regression_data['Lagged_Cumulative_Avg_Negative'])
regression_data['log_Avg_Reviewer_Experience'] = np.log(regression_data['Avg_Reviewer_Experience'])
regression_data['log_Avg_Days_Since_Review'] = np.log(regression_data['Avg_Days_Since_Review'])


Y = regression_data['Cumulative_Avg_Score']
X = regression_data[['log_Cumulative_Reviews',
                     'log_Lagged_Cumulative_Avg_Positive',
                     'log_Lagged_Cumulative_Avg_Negative',
                     'log_Avg_Reviewer_Experience',
                     'log_Avg_Days_Since_Review']]

X = sm.add_constant(X)

model = sm.OLS(Y, X).fit()

print(model.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.614
Model:                              OLS   Adj. R-squared:                  0.614
Method:                   Least Squares   F-statistic:                 1.136e+04
Date:                  Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                          11:37:54   Log-Likelihood:                -18466.
No. Observations:                 35649   AIC:                         3.694e+04
Df Residuals:                     35643   BIC:                         3.700e+04
Df Model:                             5                                         
Covariance Type:              nonrobust                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                  8.2208      0.041    201.870      0.000       8.141       8.301
log_Cumulative_Reviews                -0.0042      0.002     -2.115      0.034      -0.008      -0.000
log_Lagged_Cumulative_Avg_Positive     1.2147      0.009    129.007      0.000       1.196       1.233
log_Lagged_Cumulative_Avg_Negative    -1.0673      0.007   -154.706      0.000      -1.081      -1.054
log_Avg_Reviewer_Experience           -0.0012      0.003     -0.406      0.685      -0.007       0.004
log_Avg_Days_Since_Review             -0.0245      0.002    -10.580      0.000      -0.029      -0.020
==============================================================================
Omnibus:                     5252.262   Durbin-Watson:                   0.246
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            51299.987
Skew:                          -0.397   Prob(JB):                         0.00
Kurtosis:                       8.823   Cond. No.                         171.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# Now with fixed effect model 
In [169]:
panel_data = enhanced_weekly_summary.copy()

panel_data = panel_data[
    (panel_data['Cumulative_Reviews'] > 0) &
    (panel_data['Cumulative_Avg_Positive'] > 0) &
    (panel_data['Cumulative_Avg_Negative'] > 0) &
    (panel_data['Avg_Reviewer_Experience'] > 0) &
    (panel_data['Avg_Days_Since_Review'] > 0)
].copy()

panel_data = panel_data.sort_values(['Hotel_Name', 'Year_Week'])

panel_data['Lagged_Cumulative_Avg_Positive'] = panel_data.groupby('Hotel_Name')['Cumulative_Avg_Positive'].shift(1)
panel_data['Lagged_Cumulative_Avg_Negative'] = panel_data.groupby('Hotel_Name')['Cumulative_Avg_Negative'].shift(1)

panel_data = panel_data.dropna(subset=['Lagged_Cumulative_Avg_Positive', 'Lagged_Cumulative_Avg_Negative'])

# Log-transform continuous predictors
panel_data['log_Cumulative_Reviews'] = np.log(panel_data['Cumulative_Reviews'])
panel_data['log_Lagged_Cumulative_Avg_Positive'] = np.log(panel_data['Lagged_Cumulative_Avg_Positive'])
panel_data['log_Lagged_Cumulative_Avg_Negative'] = np.log(panel_data['Lagged_Cumulative_Avg_Negative'])
panel_data['log_Avg_Reviewer_Experience'] = np.log(panel_data['Avg_Reviewer_Experience'])
panel_data['log_Avg_Days_Since_Review'] = np.log(panel_data['Avg_Days_Since_Review'])

#  Convertting Year_Week to datetime for proper panel index
panel_data['Year_Week_Date'] = pd.to_datetime(panel_data['Year_Week'] + '-1', format='%Y-%U-%w')

# Set panel data index (hotel, time)
panel_data = panel_data.set_index(['Hotel_Name', 'Year_Week_Date'])

Y = panel_data['Cumulative_Avg_Score']
X = panel_data[['log_Cumulative_Reviews',
                'log_Lagged_Cumulative_Avg_Positive',
                'log_Lagged_Cumulative_Avg_Negative',
                'log_Avg_Reviewer_Experience',
                'log_Avg_Days_Since_Review']]

X = sm.add_constant(X)

model_fe = PanelOLS(Y, X, entity_effects=True, time_effects=True)
results_fe = model_fe.fit()

print(results_fe.summary)
                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.1964
Estimator:                     PanelOLS   R-squared (Between):              0.2765
No. Observations:                 35649   R-squared (Within):               0.1747
Date:                  Thu, May 08 2025   R-squared (Overall):              0.2901
Time:                          11:38:42   Log-likelihood                 1.913e+04
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      1717.8
Entities:                           400   P-value                           0.0000
Avg Obs:                         89.123   Distribution:                 F(5,35141)
Min Obs:                         7.0000                                           
Max Obs:                         105.00   F-statistic (robust):             1717.8
                                          P-value                           0.0000
Time periods:                       104   Distribution:                 F(5,35141)
Avg Obs:                         342.78                                           
Min Obs:                         252.00                                           
Max Obs:                         495.00                                           
                                                                                  
                                         Parameter Estimates                                          
======================================================================================================
                                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------------------
const                                  7.9256     0.1040     76.215     0.0000      7.7218      8.1294
log_Cumulative_Reviews                 0.0492     0.0035     14.129     0.0000      0.0424      0.0560
log_Lagged_Cumulative_Avg_Positive     0.4033     0.0062     64.683     0.0000      0.3911      0.4155
log_Lagged_Cumulative_Avg_Negative    -0.3000     0.0044    -68.945     0.0000     -0.3085     -0.2914
log_Avg_Reviewer_Experience            0.0007     0.0010     0.6799     0.4966     -0.0013      0.0028
log_Avg_Days_Since_Review             -0.0085     0.0182    -0.4656     0.6415     -0.0442      0.0272
======================================================================================================

F-test for Poolability: 506.91
P-value: 0.0000
Distribution: F(502,35141)

Included effects: Entity, Time
In [ ]:
# 3- In the pooled OLS model, cumulative reviews slightly decrease scores, but in fixed effects, they increase scores, showing that within hotels, accumulating reviews boosts reputation.
# Positive and negative word counts remain highly significant in both models, affirming their strong impact on customer evaluations.
# However, the effect of positive and negative words is less extreme in fixed effects, as hotel-level language styles are absorbed by the fixed effects.
# Days since review is significant and negative in pooled OLS, reflecting older hotels with outdated feedback, but becomes insignificant in fixed effects since time variation is controlled.
# Overall, fixed effects better capture within-hotel dynamics, while pooled OLS reflects between-hotel differences, especially visible in cumulative reviews and review age effects.
In [ ]:
#In the next step I want to check for heterogenous effect with using tag column to see what type of trvel and room we have
In [185]:
#  Checking the first few examples to understand the structure
print(uk_hotels_df['Tags'].head(10))

#  Extracting unique tags
#cleaning and flatten 

# Removing brackets and split by comma
tags_series = uk_hotels_df['Tags'].dropna().str.replace(r"[\[\]']", '', regex=True).str.split(',')

# Flattening the list of lists and strip whitespaces
all_tags = tags_series.explode().str.strip()

# Getting unique tags
unique_tags = all_tags.unique()

print(f"Number of unique tags: {len(unique_tags)}")
print("Unique tags:")
for tag in unique_tags:
    print(f"- {tag}")
405    [' Leisure trip ', ' Group ', ' Classic Twin R...
406    [' Leisure trip ', ' Couple ', ' Classic Twin ...
407    [' Leisure trip ', ' Family with young childre...
408    [' Leisure trip ', ' Couple ', ' Classic Doubl...
409    [' Leisure trip ', ' Group ', ' Classic Twin R...
410    [' Leisure trip ', ' Couple ', ' Classic Twin ...
411    [' Leisure trip ', ' Group ', ' Classic Triple...
412    [' Leisure trip ', ' Couple ', ' Classic Doubl...
413    [' Leisure trip ', ' Group ', ' Classic Triple...
414    [' Leisure trip ', ' Couple ', ' Classic Doubl...
Name: Tags, dtype: object
Number of unique tags: 821
Unique tags:
- Leisure trip
- Group
- Classic Twin Room
- Stayed 3 nights
- Couple
- Stayed 2 nights
- Family with young children
- Executive King Room
- Stayed 6 nights
- Classic Double Room
- Stayed 4 nights
- Classic Triple Room
- Stayed 1 night
- Submitted from a mobile device
- Solo traveler
- Business trip
- Classic Single Room
- Stayed 5 nights
- Family with older children
- Stayed 7 nights
- 2 rooms
- Travelers with friends
- Stayed 11 nights
- Stayed 10 nights
- Stayed 9 nights
- With a pet
- Stayed 8 nights
- Classic King Room
- Deluxe King Room with Club Lounge Access and Balcony
- City King Room
- Superior King Room
- City Twin Room
- Deluxe King Room with Club Lounge Access
- Superior King Room with Balcony
- Master Suite with Club Lounge Access
- Junior Suite with Club Lounge Access
- Stayed 14 nights
- Stayed 12 nights
- Triple Room
- Compact Double Room
- Deluxe Double Room
- Club Double Room
- Deluxe Twin Room
- Junior Suite
- Stayed 16 nights
- Stayed 15 nights
- Stayed 13 nights
- Stayed 18 nights
- Quadruple Room
- Stayed 17 nights
- 3 rooms
- Double Room
- Twin Room
- Single Room
- Champagne Shopping package Double Room
- Superior Double Room
- Superior Double Room with Two Double Beds
- Treat your Partner Package Double Room
- 4 rooms
- Superior Twin Room
- Champagne Shopping package Twin Room
- Stayed 20 nights
- Executive Studio with London Eye View
- Studio Double Family
- Studio Double Room
- Junior Suite 3 Adults
- Superior Double Room with London Eye View
- Executive Studio City View Family
- Penthouse Suite Family
- Executive Studio City View
- Penthouse Suite
- Junior Suite 2 Adults
- Aldwych Double Twin Room
- Executive Double Room
- Executive Suite
- Studio Suite
- Superior Double or Twin Room
- Classic Double or Twin Room
- Deluxe King Room
- King Room
- Deluxe Double or Twin Room
- Executive Double or Twin Room
- Deluxe Single Room
- Deluxe Triple Room
- Club Twin Room
- Stayed 21 nights
- Superior Queen Room
- Master Suite
- Deluxe Studio
- Double Room with Sofa Bed Non Smoking
- Standard Room
- Executive Room Non Smoking
- Double Room Non Smoking
- Plaza Club Room
- Double Room with Two Double Beds
- Executive Twin Room
- Double Room Disability Access
- Superior Room with 2 Single Beds
- Superior Room
- Executive Room
- Executive Premium Room with a Queen Bed
- Luxury Junior Suite
- Deluxe Room
- Luxury Room
- Deluxe Junior Suite
- One Bedroom Suite
- Family Room
- Deluxe King Room with River View
- Club King Room with Lounge Access
- Deluxe King Room with Canary Wharf View
- Deluxe King Suite
- Superior King Room with River View
- Deluxe Suite with River View
- Double or Twin Room
- Knightsbridge Suite
- Loft
- Loft Suite
- Junior Suite 2 Adults 1 Child
- Double Room with Four Poster Bed
- Nadler Deluxe Double Room
- Small Double Room
- King Double or Twin Room
- Standard Double Room
- Superior King or Twin Room
- Suite
- Comfort Triple Room
- Luxury Double Room
- Luxury Suite
- Queen Room
- Junior Suite Attic
- Suite with Terrace
- Superior Room with Queen Bed and Sofa
- Standard Room with Queen Bed and Sofa
- Superior Room with Queen Bed Sofa
- Superior Room with Queen Bed
- Executive Room with Queen Bed
- Superior Premium Room with Queen Bed Sofa
- Executive Queen Room
- Wonderful Room
- Spectacular Room
- Fabulous Room
- Cool Corner Room
- Marvellous Suite
- Superior King Room with Sofa Bed
- Deluxe Double Room with Two Double Beds
- Club King Room
- Suite with Sofa Bed
- Executive Room Selected at Check In
- Superior Suite
- Queen Guest Room
- King Duplex Suite
- Twin Guest Room
- Queen Superior Room
- King Deluxe Family Room
- Queen Deluxe Room
- Twin Superior Room
- Double Guest Room
- King One Bedroom Suite
- King Guest Room
- Queensgate Double Room
- Queensgate Triple Room
- Duplex Suite
- Queensgate Twin Room
- Club Suite
- Club Deluxe Room
- Queen One Bedroom Suite
- Superior Deluxe Double Room
- Standard Suite
- Deluxe King Studio Suite
- Accessible Studio Suite
- Standard Single Room
- Standard Twin Room
- Classic Room
- Panoramic Room
- Deluxe Double Room with View
- Studio Room
- Stayed 25 nights
- Queen Room Non Smoking
- Twin Room Non Smoking
- Standard Double or Twin Room
- Executive King Room with Sofa Bed Non Smoking
- Queen Room Disability Access Non Smoking
- Executive King Room Non Smoking
- Deluxe Studio Suite
- One Bedroom Studio Suite
- Studio Disability Access
- Economy Single Room basement no window
- Comfort Single Room with Window and Double Bed
- Quadruple Room with Shower
- Two Bedroom Suite
- Triple Room Disability Access
- Economy Double Room basement no window
- King Suite
- Presidential Suite
- Junior Suite with Park View
- One Bedroom Suite with Park View
- Grand Deluxe Double Room
- Queen Double Room
- King Double Room
- Deluxe Queen Room
- Superior Queen Room with Double Sofa
- Superior Queen Room with Sofa
- Premium Superior Queen Room
- Superior Premium Queen Room
- Executive Queen Room with Sofa
- Executive Queen Room with Sofa Bed 3 Adults
- Room with Park View
- One Bedroom Apartment
- Green Park Suite
- Standard King Room
- Suite with City View
- Comfort Single Room with Shower
- Deluxe Double with Terrace
- Economy Double Room
- Executive Suite 2 Adults 2 Children
- Family Room 2 Adults 2 Children
- Executive Family Room
- Superior Triple Room
- Club Single Room
- Executive Triple Room
- Family Room Sleeps Four
- Ambassadors Room
- Prestige Double Room
- Superior Room with Two Double Beds and Balcony
- Deluxe with Two Double Beds
- Superior King with Balcony
- Deluxe King with Balcony
- Executive King
- Executive with Two Double Beds
- King Room with Sofa Bed and Balcony
- Queen Guestroom
- Deluxe King Guestroom with view
- Deluxe Queen Guestroom
- Double Hilton Guestroom
- Single Hilton Guestroom
- Deluxe King Guestroom
- Grand Superior Room
- Standard Guestroom
- Principle Double
- Super Deluxe Double Room
- Superior Single Room
- Business King Room
- Executive Double Room with Two Double Beds with Lounge Access
- Executive King Room with Lounge Access
- Superior King Room with Lounge Access
- Stayed 23 nights
- Stayed 29 nights
- Superior Double Room with City View
- Junior Suite with City View
- Luxury Family Room
- Deluxe King or Twin Room
- Executive King or Twin Room with Lounge Access and Park View
- Junior Suite with Lounge Access and City View
- King Room Disability Access Non Smoking
- Family Superior Room
- Double Executive Room with Lounge Access
- Terrace Suite with Lounge Access
- Double Room GWR Tower
- Tower Suite with Lounge Access
- Queen Suite with Lounge Access
- Deluxe Twin or Single Room
- King Executive Room with Lounge Access
- Single Room with Shower
- Delightful Queen Room
- Delightful King Twin Room
- Deluxe King Room with Garden View
- Superior Junior Suite
- Grand Suite
- Queen Room with Mobility Access
- Deluxe King Room with Balcony
- Deluxe Suite
- Deluxe Double
- Montcalm Club Twin Room
- Montcalm Club Double Room
- Deluxe Double and Single Room
- Junior Deluxe Double Room
- Junior Deluxe Twin Room
- Standard Compact Double Room
- Standard Queen Room
- Twin Room Disability Access
- King Room or Twin Room Disability Access
- Twin or Double Room with Terrace
- Standard Double Room without Window
- Stayed 28 nights
- Single Guest Room
- Twin Guest Room Plus
- Double Hilton Guest Room Plus
- Stayed 19 nights
- Executive Single Room
- Family Room with Bathroom
- King Size Double Room
- Executive Room with Two Queen Beds
- Family Suite
- Executive Double Room River View
- Stayed 27 nights
- Garden Room
- Mezzanine Suite
- King Magistrate Suite
- Family Room 3 Adults
- Standard Double Twin Room
- Business Double Room
- Club Queen Room
- King Studio
- Single Room with Double Bed
- Garden King Room
- Garden Twin Room
- Suite with Balcony
- King Guest Room Wheelchair Accessible
- Executive Twin Room with Lounge Access
- Junior Suite with Lounge Access
- One Bedroom Suite with Lounge Access
- Deluxe King Room Wheelchair Accessible
- Deluxe Junior Suite with Lounge Access
- Junior King Suite
- Double Family Guestroom
- Triple Family Guestroom
- Accessible Queen Guest Room
- Deluxe Premium King Room
- Family Room with Two Double Beds
- Deluxe Double Twin Room
- Deluxe King Plus
- Double Double Room
- King Room with View
- Deluxe Family Room
- Executive Club Double Room
- Executive Club Twin Room
- Stayed 24 nights
- Stayed 22 nights
- Club Double or Twin Room
- Three Bedroom Luxury Apartment
- Studio
- Deluxe Double Room with River View
- Deluxe Double Room with City View
- Executive Room with River View
- Executive Room with Garden View
- Deluxe Double Room with Garden View
- Executive Room City View
- Grand Executive Room
- Suite with River View
- Executive King Room with View
- Grand Executive Room with City View
- Standard Double Room with City View
- Premium King Room
- Premier Double Room
- Luxury Double Room Mezzanine
- Luxury Four Poster
- Luxury Twin Room Solarium
- Pasha Suite
- Luxury Twin Room
- Executive Four Poster Room
- Superior
- Standard
- Superior Double Double
- River View Balcony Suite
- Superior Double Room with Private Patio
- River View Deluxe
- Standard Double
- Luxury Studio Suite
- Suite Family Room
- Deluxe Double Room or Twin Room
- Club Room
- King Deluxe Room
- Junior King Suite with Lounge Access
- King Deluxe One Bedroom Suite with Lounge Access
- King Accessible Superior Room
- King Grand Deluxe Room
- King Superior Room
- Twin Deluxe Room
- Twin Grand Deluxe Room
- King One Bedroom Suite with Lounge Access
- Accessible King Room with Lounge Access
- Deluxe King Room with Water View
- Executive King or Twin Room with Lounge Access
- Studio Suite with Lounge Access
- Curve Suite with Lounge Access
- Executive Suite with Lounge Access
- King Guestroom
- Mayfair Suite with Lounge Access
- Twin Guestroom
- Executive Queen Room with Lounge Access
- Business Class Room
- Special Offer Double Room
- Landmark Junior Suite
- Twin Hilton Guest Room
- King Hilton Guest Room
- Superior Family Room
- Triple Hilton Family Room
- King Suite with Lounge Access
- Hilton Queen Family Room
- Octagon Suite with Lounge Access
- Comfort Double Room
- Superior Twin or Double Room
- Basic Double Room
- Executive Studio
- Comfort Quadruple Room
- Single Cabin Room
- Small Queen Room
- One Room Suite with Kitchenette
- Bunk Bed Room
- Premier King Room
- Grand Executive King Room
- Grand Premier Family Room
- Premier Twin Room
- Grand Premier King Room
- Grand Premier Suite
- Standard Apartment
- Deluxe Double Room with Four Poster Bed
- Superior One Bedroom Apartment
- Superior Studio Apartment
- Standard 1 Bedroom Apartment
- Deluxe Double Room with Separate Lounge Area
- Suite with Garden View
- Deluxe Queen Room with River View
- Junior Suite with River View and Balcony
- Deluxe Queen Room with Balcony
- One Bedroom King Suite
- Two Bedroom Family Room
- Ashburn Suite
- Stayed 30 nights
- Superior Four Poster Room
- Lower Ground Double Room
- Double Room with Shared Bathroom
- Single Room with Shared Bathroom
- Double Room with City View
- Double Room 1 Adult
- Double Room with City View 1 Adult
- Deluxe Double Room 1 Adult
- Gallery Suite
- Small Single Room
- Family Suite 2 Adults 2 Children
- Room Selected at Check In
- Superior Room with Queen size bed and Sofa Bed
- Executive Double Room with Double Bed and Sofa Bed
- Superior Room with Queen size bed and Sofa Bed 2 Adults
- Superior Room with Queen size Bed and Double Sofa Bed
- Four Poster Suite
- Double Room with Small Double Bed
- Double Room with View
- Executive Queen Room with Lounge Access Mobility Access
- Deluxe Queen Room with Sofa
- Junior Triple Room
- Regency King Suite with Lounge Access
- Twin Room with View
- Double or Twin Room with Garden View
- Club King Room with View and Lounge Access
- Club Twin Room with Lounge Access
- Regency Executive Suite with Lounge Access
- Junior Double Room
- Suite Room
- Double Suite
- Twin Suite
- Standard Family Room
- Club SuperKing Room
- Standard Triple Room
- Deluxe King Room with City View
- Premier King Room with City View
- Superior Shard King
- Iconic King Room With City View
- Premier Shard King Room
- Premier Twin Room with City View
- Premier Shard Suite
- Executive King Twin Room
- Director s Double Room
- Dalston King
- Magistrate King
- Xscape King
- Shoreditch Suite
- Hoxton Studio Suite
- Hoxton Family Suite
- Superior Double Room with Balcony and City View
- Luxury Double Room with Balcony and River View
- Suite 2 Adults 1 Child
- Suite 3 Adults
- Corner Suite 2 Adults 1 Child
- Superior Oversized Queen Room
- Queen Suite
- Superior King Room Non Smoking
- Emperor Suite
- Premium Deluxe Double Room
- Classic Queen Room
- Classic King Twin Room
- Spring Package Deluxe Double or Twin Room
- Superior Double
- Compact Double Room shared bathroom
- Deluxe Single Room with Shared Bathroom
- Apartment Suite
- Executive Studio Apartment 2 Adults 1 child
- Deluxe Studio Apartment 2 Adults
- Small Studio
- Classic Studio
- 5 rooms
- Duplex
- Cosy Double Room
- Executive Double Room with Air Conditioning
- Cosy Single Room
- Executive King Room with Air Conditioning
- Double Deluxe Suite
- King Guest Room with Sofa Bed
- Twin Executive Room
- Double Executive Room
- Double Junior Suite
- Standard Double Studio
- Single Studio
- Luxury 1 Bed Suite
- Executive Deluxe Studio Suite
- Standard Studio Suite
- Executive Studio Suite
- Royal Mews View King Room
- Royal Mews View Classic Double
- Executive King Suite
- Royal Double Room
- Executive King or Twin Room
- Family Queen Room
- Double Room with Sofa Bed
- Two Bedroom Suite with Terrace
- Large King Room
- Andaz Large Suite
- Andaz Double Room
- Andaz King Twin Room
- Andaz Large King Room
- Twin Room with Private Bathroom
- Compact Double Room with Private Bathroom
- Single Room with Private Bathroom
- Double Room with Private Bathroom
- Petite Double Room
- Executive Junior Suite
- Classic Junior Suite
- Classic Master Suite
- Executive Master Suite
- Junior Suite Split Level with Lounge Access
- Luxury Suite Conservatory with Lounge Access
- Studio 2 Adults
- Superior Twin Double Room
- Budget Single Room
- Deluxe Double Room with Balcony
- Park View Studio
- Deluxe
- Loft Junior Suite
- Deluxe with Garden
- Superior Room with 1 Queen Bed
- Superior Room with 1 Queen Bed 1 Sofa Bed
- Executive Room with 1 Queen bed
- Superior Queen Room with 2 Single Sofa Beds
- Superior Queen Room with Single Sofa Bed
- Premier River King Room
- Townhouse Deluxe
- Townhouse Club
- Townhouse Suite
- Townhouse Apartment
- Premier Single Room
- Two Bedroom Penthouse Apartment
- Stayed 26 nights
- Double Deluxe Room
- Twin Executive Room with Lounge Access
- Double Deluxe Plus Room
- Hilton Deluxe Twin Room Plus
- Superior Queen Suite with Sofa
- King or Queen Studio
- Loft Double Room with Terrace
- Loft Double Room
- Deluxe Two Bedroom Apartment
- Deluxe One Bedroom Suite
- Deluxe Double Studio
- Deluxe Twin Studio
- Superior One Bedroom Suite
- Superior Double Studio
- Cosy Twin Room
- Montcalm Club Room
- Standard Double Room No Window
- Exclusive Suite
- Executive Room with 1 Queen Bed and 1 Sofa 3 Adults
- Superior Premium Room with Queen Bed and Sofa Bed
- Superior Premium Queen Room with Sofa Bed
- Executive Premium Queen Room with Sofa Bed
- Executive Room with 1 Queen Bed and 1 Sofa 2 Adults
- Executive Deluxe King Room
- Stanhope Suite
- Belgravia Suite
- Double Superior Room
- Two Double Bed Guest Room
- Double Accessible Guest Room
- Double Hilton Deluxe
- King Executive Suite with Lounge Access
- Deluxe Double Room without Window
- Small Double Room without Window
- Suite without Window
- Dorsett Double Room
- Standard Quad Room
- Duke of Monmouth Suite
- Studio King 3 adults
- Studio King Family
- Studio Twin Room 3 adults
- Studio Twin Room
- Premium King Room with View
- Junior Suite Courtyard
- Double Room with Single Bed
- Deluxe Double plus Single Bed
- Executive Queen Room with Two Queen Beds with lounge Access
- One Bedroom King Suite with Sofa Bed
- Easter Special Executive King Room
- Club Level Classic Rooms
- Studio Twin
- Twin Room with Courtyard View
- King Room with Courtyard View
- Hyde Park Junior Suite with Park View
- King Room with Hyde Park View
- King Room with Knightsbridge View
- Mandarin Junior Suite
- One Bedroom Suite with Courtyard View
- Suite with Knightsbridge View
- Grand Superior King or Twin Room
- Superior King
- Grand Deluxe King or Twin Room
- Grand Junior Suite
- City King or Twin Room
- Junior Suite with Balcony
- One Bedroom Suite with Kitchenette
- Queen Room Mobility Access
- King Atrium Suite with Lounge Access
- Family Room with sofa bed
- Standard Queen Room with Sofa Bed
- Superior Queen Room with Sofa Bed
- Standard Double Room with Two Single Beds
- Executive Premium Queen Room
- Privilege Room with King bed
- Privilege Super King or Twin Room
- Privilege Family Room
- Cabin Single Room
- Superior Double Room with Balcony
- Trash City Suite
- Signature Luxury King with Balcony
- Squint Splash Suite
- Drop Dead Gorgeous Splash Suite
- Jimmie Martin Penthouse Suite
- Squint Penthouse Suite
- Double Room Courtyard
- Privilege Twin King Room
- Superior Studio
- One Bedroom Classic Suite
- Master Bedroom Suite
- Master Bedroom Double
- Spare Room Double
- Superior Double Bedroom
- Classic Double Room with Breakfast
- Budget Double Room Second Floor
- King Room Second Floor
- Budget Double Room
- Standard Guest Room
- One Bedroom Park Suite
- Two Bedroom Superior Suite
- Studio Deluxe Suite
- One Bedroom Deluxe Suite
- Three Bedroom Suite
- Two Bedroom Deluxe Suite
- Family Studio
- Double Studio
- Triple Studio
- Twin Studio
- Premier Deluxe Double Room
- Premier Deluxe Twin Room
- Business Studio Suite
- Single Room Lower Ground Floor
- Double Family Guest Room
- Two Bedroom Apartment
- Accessible Twin Guest Room
- King Junior Suite
- Double or Twin Room Non Smoking
- Executive King Room Non Smoking with Executive Lounge Access
- Double Room Disability Access Non Smoking
- Deluxe Double Room Non Smoking
- King Suite Non Smoking with Executive Lounge Access
- Basic Single Room
- Superior Double Twin Room
- Junior Suite with Executive Lounge Access
- The Level Executive Service Room
- Queen Hilton Guest Room
- Queen Hilton Superior Room
- Queen Hilton Deluxe
- Twin Hilton Deluxe
- Hilton Executive Suite with Lounge Access
- Twin Hilton Room Plus
- Standard Studio
- Deluxe Double Room with Two Single Beds
- Deluxe King Room Disability Access
- Studio Apartment
- Stayed 31 nights
- Falconers One Bedroom Suite
- Falconers Two Bedroom Suite
- Kings Deluxe Junior Suite
- Kings One Bedroom Suite
- Minsters Deluxe Two Bedroom Suite
- Kings Junior Suite
- Kings Three Bedroom Suite
- Minsters Deluxe One Bedroom Suite
- Deluxe Balcony Room
- Superior Garden View Room
- Standard Double Room Disability Access
- Superior Queen Single Room
- Standard Queen or Twin Room Lower Ground Floor
- Chelsea Double Suite
- Marina Double Suite
- Marina Twin Suite
- River Double Suite
- Chelsea Twin Suite
- River Twin Suite
- Luxury Triple Room
- Family Room 2 Adults 1 Child
- Deluxe Room Barlow Wing
- Chambers Junior Suite with Club Lounge Access
- Superior King Room Barlow Wing
- Premier King Room Barlow Wing
- Superior Twin Room Barlow Wing
- Chambers Grand Junior Suite with Club Lounge Access
- One Bedroom Queen Suite
- Queen Guest Room with Sofa Bed
- Accessible Twin Room
- Deluxe Accessible Twin Room with View
- Queen Suite with Balcony and City View
- Deluxe Twin Room with View
- Deluxe Queen Room with View
- Queen Suite with City View
- Large Double or Twin Room
- Deluxe Guest Room
- One Bedroom Suite with Executive Lounge Access
- Executive King Room with Executive Lounge Access
- Deluxe Room with view
- Superior Room with Two Double Beds and View
- Balcony Suite with Executive Lounge Access
- Tiny Single Room
- Luxury One Bedroom Apartment
- Luxury Two Bedroom Apartment
- Twin Hilton Deluxe Room
- Hilton Superior Twin Room
- Hilton Deluxe King Room
- Double Hilton Deluxe Room
- Hilton Superior Double Room
- Hilton Superior King Room
- City Room
- Halkin Room
- Superior Room with King Bed
- Westminster Suite with King Bed
- Junior Conservatory with King Bed
- Double Room with Garden View
- Double Room with Street View
- Executive Double Room with Lounge Access
- Single Hilton Guest Room
- One Bedroom Apsley Suite
- Premier Room
- Accessible Double Guest Room
- Family Room with King Bed
- Deluxe King Room with Stadium View
- Executive Queen Room Mobility Access with Lounge Access
- Corner Suite with Lounge Access
- Executive Double
- Junior Suite Basement
- Classic Twin Room with View
- Classic Double Room with View
- Superior Double Room with Park View
- Superior Twin Room with City View
- Superior Twin Room with Park View
- Executive Double Twin City View
- Deluxe Double or Twin Room with City View
- Superior Room with View of Big Ben
- King or Twin Room with River View
- Executive Double or Twin Room with view of Big Ben and Lounge Access
- Executive Room with View and Lounge Access
- One Bedroom Executive Suite with Lounge Access
- Superior Plus Room
- Deluxe Plus Double Room
- Deluxe Metropolitan City Double Room
- Superior Metropolitan Park Double Room
- Deluxe One Bedroom Park Suite
- One Bedroom City Suite
- King Room with River View
- Deluxe Suite with Executive Lounge Access
- Executive King Suite with Lounge Access
- Superior One Bedroom Suite with Executive Lounge Access
- The Rooks Nest
- Deluxe Room with Two Double Beds
- Executive Room with Two Double Beds and Executive Lounge Access
- Grand Executive Club Room
- Double Room Medium
- Double Room Cosy
- Double Room Crash Pad
- Double Room Medium Heritage
- Double Room Crash Pad Under 30s
- Double Room large
- Double Room Large Heritage
- Junior Double
- Embassy Suite
- K Suite
- Queen Room with Sofa Bed
- Superior Plus Twin Room
- Superior Plus Double Room
- King Junior Suite with Lounge Access
- Standard Double Room with Bridge View
- Standard Twin Room with Bridge View
- Luxury King Room
- Two Bedroom Family
- Luxury King Room with River View
- Deluxe Junior Partial River View
- One Bedroom River View Suite
- One Bed Feature Room
- Double Room Disability Access Tub
- Double Room with Terrace
- Deluxe Club Double Room
- Deluxe Double Room with Lounge Access
- Club Double Room with Lounge Access
- Double Room with River View
- Junior Suite with River View
- Studio King
- Superior Twin Room with Internal View
- Superior Double Room Disability Access
- Superior Double Room with Internal View
- Studio Triple
- One Bedroom Family Suite
- Deluxe Park View
- Standard Double Room with Park View
- Standard King Room Non Smoking
- King Room Mobility Accessible Non Smoking
- Executive Club Room
In [186]:
#  Making a clean copy to avoid SettingWithCopyWarning
uk_hotels_df = uk_hotels_df.copy()

#  Cleaning the tags
uk_hotels_df['Clean_Tags'] = uk_hotels_df['Tags'].str.replace(r"[\[\]']", '', regex=True).str.split(',')

#  Creating dummies at review level
uk_hotels_df['is_business_trip'] = uk_hotels_df['Clean_Tags'].apply(
    lambda tags: int('Business trip' in [tag.strip() for tag in tags]) if isinstance(tags, list) else 0
)

uk_hotels_df['is_leisure_trip'] = uk_hotels_df['Clean_Tags'].apply(
    lambda tags: int('Leisure trip' in [tag.strip() for tag in tags]) if isinstance(tags, list) else 0
)
In [187]:
#  Extracting year-week
uk_hotels_df['Review_Date'] = pd.to_datetime(uk_hotels_df['Review_Date'])
uk_hotels_df['Year_Week'] = uk_hotels_df['Review_Date'].dt.strftime('%Y-%U')
In [188]:
#  Grouping by hotel-week
tags_summary = uk_hotels_df.groupby(['Hotel_Name', 'Year_Week']).agg(
    total_reviews=('Review_Date', 'count'),
    total_business=('is_business_trip', 'sum'),
    total_leisure=('is_leisure_trip', 'sum')
).reset_index()

#  Computing shares
tags_summary['Share_Business_Trip'] = tags_summary['total_business'] / tags_summary['total_reviews']
tags_summary['Share_Leisure_Trip'] = tags_summary['total_leisure'] / tags_summary['total_reviews']


tags_summary.head()
Out[188]:
Hotel_Name Year_Week total_reviews total_business total_leisure Share_Business_Trip Share_Leisure_Trip
0 11 Cadogan Gardens 2015-32 1 0 1 0.0 1.0
1 11 Cadogan Gardens 2015-35 1 0 1 0.0 1.0
2 11 Cadogan Gardens 2015-36 2 0 2 0.0 1.0
3 11 Cadogan Gardens 2015-37 1 1 0 1.0 0.0
4 11 Cadogan Gardens 2015-38 2 0 2 0.0 1.0
In [175]:
#  a table in which it showing wether the share of business trip was high or Lesuire trip.
In [189]:
# Merge with enhanced_weekly_summary
enhanced_weekly_summary = enhanced_weekly_summary.merge(
    tags_summary[['Hotel_Name', 'Year_Week', 'Share_Business_Trip', 'Share_Leisure_Trip']],
    on=['Hotel_Name', 'Year_Week'],
    how='left'  
)


enhanced_weekly_summary.head()
Out[189]:
Hotel_Name Year_Week Reviews_This_Week Sum_Scores_This_Week Avg_Score_This_Week Cumulative_Reviews Cumulative_Sum_Scores Cumulative_Avg_Score Total_Positive_Words_This_Week Total_Negative_Words_This_Week ... Cumulative_Avg_Positive Cumulative_Avg_Negative Avg_Days_Since_Review Avg_Reviewer_Experience Share_Business_Trip_x Share_Leisure_Trip_x Share_Business_Trip_y Share_Leisure_Trip_y Share_Business_Trip Share_Leisure_Trip
0 11 Cadogan Gardens 2015-32 1 10.0 10.00 1 10.0 10.00 7 0 ... 7.000000 0.000000 721.0 111.0 0.0 1.0 0.0 1.0 0.0 1.0
1 11 Cadogan Gardens 2015-35 1 10.0 10.00 2 20.0 10.00 51 0 ... 29.000000 0.000000 704.0 4.0 0.0 1.0 0.0 1.0 0.0 1.0
2 11 Cadogan Gardens 2015-36 2 13.8 6.90 4 33.8 8.45 16 46 ... 18.500000 11.500000 693.0 8.5 0.0 1.0 0.0 1.0 0.0 1.0
3 11 Cadogan Gardens 2015-37 1 9.2 9.20 5 43.0 8.60 2 15 ... 15.200000 12.200000 688.0 8.0 1.0 0.0 1.0 0.0 1.0 0.0
4 11 Cadogan Gardens 2015-38 2 17.9 8.95 7 60.9 8.70 56 14 ... 18.857143 10.714286 679.0 2.5 0.0 1.0 0.0 1.0 0.0 1.0

5 rows × 22 columns

In [190]:
print(enhanced_weekly_summary.columns.tolist())
['Hotel_Name', 'Year_Week', 'Reviews_This_Week', 'Sum_Scores_This_Week', 'Avg_Score_This_Week', 'Cumulative_Reviews', 'Cumulative_Sum_Scores', 'Cumulative_Avg_Score', 'Total_Positive_Words_This_Week', 'Total_Negative_Words_This_Week', 'Cumulative_Sum_Positive', 'Cumulative_Sum_Negative', 'Cumulative_Avg_Positive', 'Cumulative_Avg_Negative', 'Avg_Days_Since_Review', 'Avg_Reviewer_Experience', 'Share_Business_Trip_x', 'Share_Leisure_Trip_x', 'Share_Business_Trip_y', 'Share_Leisure_Trip_y', 'Share_Business_Trip', 'Share_Leisure_Trip']
In [182]:
# I first check their effect by interation term for time variation model (without fixed effct). Just for business trip
In [191]:
regression_data = enhanced_weekly_summary.copy()

#  Droping rows with zero or negative values before log-transform
regression_data = regression_data[
    (regression_data['Cumulative_Reviews'] > 0) &
    (regression_data['Cumulative_Avg_Positive'] > 0) &
    (regression_data['Cumulative_Avg_Negative'] > 0) &
    (regression_data['Avg_Reviewer_Experience'] > 0) &
    (regression_data['Avg_Days_Since_Review'] > 0)
].copy()

# Sortting and creating lagged variables
regression_data = regression_data.sort_values(['Hotel_Name', 'Year_Week'])
regression_data['Lagged_Cumulative_Avg_Positive'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Positive'].shift(1)
regression_data['Lagged_Cumulative_Avg_Negative'] = regression_data.groupby('Hotel_Name')['Cumulative_Avg_Negative'].shift(1)

#  Droping rows with missing lagged values
regression_data = regression_data.dropna(subset=['Lagged_Cumulative_Avg_Positive', 'Lagged_Cumulative_Avg_Negative'])

#  Log-transform main variables
regression_data['log_Cumulative_Reviews'] = np.log(regression_data['Cumulative_Reviews'])
regression_data['log_Lagged_Cumulative_Avg_Positive'] = np.log(regression_data['Lagged_Cumulative_Avg_Positive'])
regression_data['log_Lagged_Cumulative_Avg_Negative'] = np.log(regression_data['Lagged_Cumulative_Avg_Negative'])
regression_data['log_Avg_Reviewer_Experience'] = np.log(regression_data['Avg_Reviewer_Experience'])
regression_data['log_Avg_Days_Since_Review'] = np.log(regression_data['Avg_Days_Since_Review'])

#  Creating interaction terms with Share_Business_Trip
regression_data['interaction_CumReviews_Business'] = regression_data['log_Cumulative_Reviews'] * regression_data['Share_Business_Trip']
regression_data['interaction_Positive_Business'] = regression_data['log_Lagged_Cumulative_Avg_Positive'] * regression_data['Share_Business_Trip']
regression_data['interaction_Negative_Business'] = regression_data['log_Lagged_Cumulative_Avg_Negative'] * regression_data['Share_Business_Trip']


Y = regression_data['Cumulative_Avg_Score']
X = regression_data[[
    'log_Cumulative_Reviews',
    'log_Lagged_Cumulative_Avg_Positive',
    'log_Lagged_Cumulative_Avg_Negative',
    'log_Avg_Reviewer_Experience',
    'log_Avg_Days_Since_Review',
    'interaction_CumReviews_Business',
    'interaction_Positive_Business',
    'interaction_Negative_Business'
]]

X = sm.add_constant(X)

#  OLS regression
model = sm.OLS(Y, X).fit()

print(model.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.623
Model:                              OLS   Adj. R-squared:                  0.623
Method:                   Least Squares   F-statistic:                     7366.
Date:                  Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                          12:47:29   Log-Likelihood:                -18062.
No. Observations:                 35649   AIC:                         3.614e+04
Df Residuals:                     35640   BIC:                         3.622e+04
Df Model:                             8                                         
Covariance Type:              nonrobust                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                  8.2286      0.040    204.258      0.000       8.150       8.308
log_Cumulative_Reviews                 0.0087      0.002      3.583      0.000       0.004       0.013
log_Lagged_Cumulative_Avg_Positive     1.1632      0.010    114.985      0.000       1.143       1.183
log_Lagged_Cumulative_Avg_Negative    -1.0315      0.008   -132.809      0.000      -1.047      -1.016
log_Avg_Reviewer_Experience            0.0087      0.003      3.077      0.002       0.003       0.014
log_Avg_Days_Since_Review             -0.0246      0.002    -10.761      0.000      -0.029      -0.020
interaction_CumReviews_Business       -0.0862      0.008    -11.291      0.000      -0.101      -0.071
interaction_Positive_Business          0.2024      0.019     10.583      0.000       0.165       0.240
interaction_Negative_Business         -0.1325      0.020     -6.739      0.000      -0.171      -0.094
==============================================================================
Omnibus:                     5387.932   Durbin-Watson:                   0.284
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            53663.963
Skew:                          -0.413   Prob(JB):                         0.00
Kurtosis:                       8.954   Cond. No.                         173.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# now with fixed effect 
In [192]:
regression_data = regression_data.copy()

regression_data['Year_Week_Date'] = pd.to_datetime(regression_data['Year_Week'] + '-1', format='%Y-%W-%w')

panel_data = regression_data.set_index(['Hotel_Name', 'Year_Week_Date'])
Y = panel_data['Cumulative_Avg_Score']
X = panel_data[[
    'log_Cumulative_Reviews',
    'log_Lagged_Cumulative_Avg_Positive',
    'log_Lagged_Cumulative_Avg_Negative',
    'log_Avg_Reviewer_Experience',
    'log_Avg_Days_Since_Review',
    'interaction_CumReviews_Business',
    'interaction_Positive_Business',
    'interaction_Negative_Business'
]]

X = sm.add_constant(X)

# fixed effects regression (hotel and time fixed effects)
model_fe = PanelOLS(Y, X, entity_effects=True, time_effects=True)
results_fe = model_fe.fit()

print(results_fe.summary)
                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.1970
Estimator:                     PanelOLS   R-squared (Between):              0.2780
No. Observations:                 35649   R-squared (Within):               0.1750
Date:                  Thu, May 08 2025   R-squared (Overall):              0.2916
Time:                          12:47:36   Log-likelihood                 1.914e+04
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      1077.8
Entities:                           400   P-value                           0.0000
Avg Obs:                         89.123   Distribution:                 F(8,35138)
Min Obs:                         7.0000                                           
Max Obs:                         105.00   F-statistic (robust):             1077.8
                                          P-value                           0.0000
Time periods:                       104   Distribution:                 F(8,35138)
Avg Obs:                         342.78                                           
Min Obs:                         252.00                                           
Max Obs:                         495.00                                           
                                                                                  
                                         Parameter Estimates                                          
======================================================================================================
                                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------------------
const                                  7.9337     0.1040     76.305     0.0000      7.7299      8.1375
log_Cumulative_Reviews                 0.0476     0.0035     13.408     0.0000      0.0406      0.0545
log_Lagged_Cumulative_Avg_Positive     0.4020     0.0064     63.002     0.0000      0.3895      0.4145
log_Lagged_Cumulative_Avg_Negative    -0.2962     0.0045    -65.101     0.0000     -0.3051     -0.2873
log_Avg_Reviewer_Experience            0.0013     0.0011     1.1872     0.2352     -0.0008      0.0033
log_Avg_Days_Since_Review             -0.0093     0.0182    -0.5122     0.6085     -0.0450      0.0264
interaction_CumReviews_Business        0.0052     0.0028     1.8412     0.0656     -0.0003      0.0108
interaction_Positive_Business          0.0056     0.0071     0.7950     0.4266     -0.0082      0.0195
interaction_Negative_Business         -0.0201     0.0072    -2.7688     0.0056     -0.0343     -0.0059
======================================================================================================

F-test for Poolability: 494.37
P-value: 0.0000
Distribution: F(502,35138)

Included effects: Entity, Time
In [61]:
#Share_Leisure_Trip without fix effect 
In [193]:
regression_data['interaction_CumReviews_Leisure'] = regression_data['log_Cumulative_Reviews'] * regression_data['Share_Leisure_Trip']
regression_data['interaction_Positive_Leisure'] = regression_data['log_Lagged_Cumulative_Avg_Positive'] * regression_data['Share_Leisure_Trip']
regression_data['interaction_Negative_Leisure'] = regression_data['log_Lagged_Cumulative_Avg_Negative'] * regression_data['Share_Leisure_Trip']

Y = regression_data['Cumulative_Avg_Score']
X = regression_data[[
    'log_Cumulative_Reviews',
    'log_Lagged_Cumulative_Avg_Positive',
    'log_Lagged_Cumulative_Avg_Negative',
    'log_Avg_Reviewer_Experience',
    'log_Avg_Days_Since_Review',
    'interaction_CumReviews_Business',
    'interaction_Positive_Business',
    'interaction_Negative_Business',
    'interaction_CumReviews_Leisure',
    'interaction_Positive_Leisure',
    'interaction_Negative_Leisure'
]]

X = sm.add_constant(X)

model = sm.OLS(Y, X).fit()

print(model.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.625
Model:                              OLS   Adj. R-squared:                  0.625
Method:                   Least Squares   F-statistic:                     5398.
Date:                  Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                          12:47:38   Log-Likelihood:                -17977.
No. Observations:                 35649   AIC:                         3.598e+04
Df Residuals:                     35637   BIC:                         3.608e+04
Df Model:                            11                                         
Covariance Type:              nonrobust                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                  8.2436      0.040    205.015      0.000       8.165       8.322
log_Cumulative_Reviews                -0.0365      0.017     -2.212      0.027      -0.069      -0.004
log_Lagged_Cumulative_Avg_Positive     1.3626      0.043     31.723      0.000       1.278       1.447
log_Lagged_Cumulative_Avg_Negative    -1.2326      0.043    -28.946      0.000      -1.316      -1.149
log_Avg_Reviewer_Experience            0.0083      0.003      2.914      0.004       0.003       0.014
log_Avg_Days_Since_Review             -0.0251      0.002    -10.999      0.000      -0.030      -0.021
interaction_CumReviews_Business       -0.0396      0.018     -2.182      0.029      -0.075      -0.004
interaction_Positive_Business         -0.0043      0.046     -0.092      0.927      -0.095       0.087
interaction_Negative_Business          0.0703      0.047      1.513      0.130      -0.021       0.161
interaction_CumReviews_Leisure         0.0467      0.017      2.739      0.006       0.013       0.080
interaction_Positive_Leisure          -0.2124      0.044     -4.876      0.000      -0.298      -0.127
interaction_Negative_Leisure           0.2108      0.043      4.847      0.000       0.126       0.296
==============================================================================
Omnibus:                     5382.841   Durbin-Watson:                   0.292
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            53858.324
Skew:                          -0.410   Prob(JB):                         0.00
Kurtosis:                       8.965   Cond. No.                         485.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# just leisure 
In [194]:
regression_data['interaction_CumReviews_Leisure'] = regression_data['log_Cumulative_Reviews'] * regression_data['Share_Leisure_Trip']
regression_data['interaction_Positive_Leisure'] = regression_data['log_Lagged_Cumulative_Avg_Positive'] * regression_data['Share_Leisure_Trip']
regression_data['interaction_Negative_Leisure'] = regression_data['log_Lagged_Cumulative_Avg_Negative'] * regression_data['Share_Leisure_Trip']

Y = regression_data['Cumulative_Avg_Score']
X = regression_data[[
    'log_Cumulative_Reviews',
    'log_Lagged_Cumulative_Avg_Positive',
    'log_Lagged_Cumulative_Avg_Negative',
    'log_Avg_Reviewer_Experience',
    'log_Avg_Days_Since_Review',
    'interaction_CumReviews_Leisure',
    'interaction_Positive_Leisure',
    'interaction_Negative_Leisure'
]]

X = sm.add_constant(X)

model = sm.OLS(Y, X).fit()

print(model.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.625
Model:                              OLS   Adj. R-squared:                  0.625
Method:                   Least Squares   F-statistic:                     7421.
Date:                  Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                          12:47:43   Log-Likelihood:                -17980.
No. Observations:                 35649   AIC:                         3.598e+04
Df Residuals:                     35640   BIC:                         3.605e+04
Df Model:                             8                                         
Covariance Type:              nonrobust                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                  8.2429      0.040    205.068      0.000       8.164       8.322
log_Cumulative_Reviews                -0.0701      0.006    -11.813      0.000      -0.082      -0.058
log_Lagged_Cumulative_Avg_Positive     1.3594      0.016     82.672      0.000       1.327       1.392
log_Lagged_Cumulative_Avg_Negative    -1.1729      0.016    -73.930      0.000      -1.204      -1.142
log_Avg_Reviewer_Experience            0.0082      0.003      2.895      0.004       0.003       0.014
log_Avg_Days_Since_Review             -0.0251      0.002    -11.024      0.000      -0.030      -0.021
interaction_CumReviews_Leisure         0.0803      0.007     11.234      0.000       0.066       0.094
interaction_Positive_Leisure          -0.2091      0.018    -11.695      0.000      -0.244      -0.174
interaction_Negative_Leisure           0.1514      0.018      8.251      0.000       0.115       0.187
==============================================================================
Omnibus:                     5378.071   Durbin-Watson:                   0.292
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            53705.398
Skew:                          -0.410   Prob(JB):                         0.00
Kurtosis:                       8.957   Cond. No.                         199.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# fix effect with lesire 
In [195]:
regression_data['Year_Week'] = pd.to_datetime(regression_data['Year_Week'] + '-1', format='%Y-%W-%w')

panel_data = regression_data.set_index(['Hotel_Name', 'Year_Week'])

Y = panel_data['Cumulative_Avg_Score']
X = panel_data[[
    'log_Cumulative_Reviews',
    'log_Lagged_Cumulative_Avg_Positive',
    'log_Lagged_Cumulative_Avg_Negative',
    'log_Avg_Reviewer_Experience',
    'log_Avg_Days_Since_Review',
    'interaction_CumReviews_Leisure',
    'interaction_Positive_Leisure',
    'interaction_Negative_Leisure'
]]


X = sm.add_constant(X)


model_fe = PanelOLS(Y, X, entity_effects=True, time_effects=True)
results_fe = model_fe.fit()

print(results_fe.summary)
                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.1971
Estimator:                     PanelOLS   R-squared (Between):              0.2783
No. Observations:                 35649   R-squared (Within):               0.1750
Date:                  Thu, May 08 2025   R-squared (Overall):              0.2918
Time:                          12:47:47   Log-likelihood                 1.914e+04
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      1078.1
Entities:                           400   P-value                           0.0000
Avg Obs:                         89.123   Distribution:                 F(8,35138)
Min Obs:                         7.0000                                           
Max Obs:                         105.00   F-statistic (robust):             1078.1
                                          P-value                           0.0000
Time periods:                       104   Distribution:                 F(8,35138)
Avg Obs:                         342.78                                           
Min Obs:                         252.00                                           
Max Obs:                         495.00                                           
                                                                                  
                                         Parameter Estimates                                          
======================================================================================================
                                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------------------
const                                  7.9349     0.1040     76.317     0.0000      7.7311      8.1387
log_Cumulative_Reviews                 0.0533     0.0040     13.376     0.0000      0.0455      0.0611
log_Lagged_Cumulative_Avg_Positive     0.4048     0.0081     50.143     0.0000      0.3889      0.4206
log_Lagged_Cumulative_Avg_Negative    -0.3141     0.0069    -45.524     0.0000     -0.3277     -0.3006
log_Avg_Reviewer_Experience            0.0012     0.0011     1.1188     0.2632     -0.0009      0.0032
log_Avg_Days_Since_Review             -0.0094     0.0182    -0.5159     0.6059     -0.0451      0.0263
interaction_CumReviews_Leisure        -0.0062     0.0027    -2.3214     0.0203     -0.0114     -0.0010
interaction_Positive_Leisure          -0.0022     0.0066    -0.3257     0.7447     -0.0152      0.0108
interaction_Negative_Leisure           0.0180     0.0068     2.6625     0.0078      0.0048      0.0313
======================================================================================================

F-test for Poolability: 491.81
P-value: 0.0000
Distribution: F(502,35138)

Included effects: Entity, Time
In [ ]:
#now instead of using interaction term , I would splitting the data to run two rgerssion : one for lesuire and another for bussines 
In [196]:
# Leisure dominant data
leisure_data = regression_data[regression_data['Share_Leisure_Trip'] > 0.5]

# Business dominant data
business_data = regression_data[regression_data['Share_Business_Trip'] > 0.5]
In [197]:
def run_ols_regression(data, title):
    # Define dependent and independent variables
    Y = data['Cumulative_Avg_Score']
    X = data[[
        'log_Cumulative_Reviews',
        'log_Lagged_Cumulative_Avg_Positive',
        'log_Lagged_Cumulative_Avg_Negative',
        'log_Avg_Reviewer_Experience',
        'log_Avg_Days_Since_Review'
    ]]

   
    X = sm.add_constant(X)

    
    model = sm.OLS(Y, X).fit()

    
    print(f"\n==== OLS Results for {title} ====\n")
    print(model.summary())

# for Leisure data
run_ols_regression(leisure_data, "Leisure-Dominant Weeks")

# for Business data
run_ols_regression(business_data, "Business-Dominant Weeks")
==== OLS Results for Leisure-Dominant Weeks ====

                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.620
Model:                              OLS   Adj. R-squared:                  0.620
Method:                   Least Squares   F-statistic:                     9757.
Date:                  Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                          12:47:55   Log-Likelihood:                -14682.
No. Observations:                 29956   AIC:                         2.938e+04
Df Residuals:                     29950   BIC:                         2.943e+04
Df Model:                             5                                         
Covariance Type:              nonrobust                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                  8.1146      0.044    183.360      0.000       8.028       8.201
log_Cumulative_Reviews                -0.0052      0.002     -2.406      0.016      -0.009      -0.001
log_Lagged_Cumulative_Avg_Positive     1.2400      0.010    120.574      0.000       1.220       1.260
log_Lagged_Cumulative_Avg_Negative    -1.0524      0.008   -140.288      0.000      -1.067      -1.038
log_Avg_Reviewer_Experience           -0.0019      0.003     -0.593      0.553      -0.008       0.004
log_Avg_Days_Since_Review             -0.0212      0.002     -8.727      0.000      -0.026      -0.016
==============================================================================
Omnibus:                     4340.541   Durbin-Watson:                   0.270
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            42711.711
Skew:                          -0.378   Prob(JB):                         0.00
Kurtosis:                       8.801   Cond. No.                         176.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

==== OLS Results for Business-Dominant Weeks ====

                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.606
Model:                              OLS   Adj. R-squared:                  0.605
Method:                   Least Squares   F-statistic:                     697.4
Date:                  Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                          12:47:55   Log-Likelihood:                -1507.1
No. Observations:                  2272   AIC:                             3026.
Df Residuals:                      2266   BIC:                             3061.
Df Model:                             5                                         
Covariance Type:              nonrobust                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                  9.1418      0.169     54.130      0.000       8.811       9.473
log_Cumulative_Reviews                -0.0722      0.010     -7.485      0.000      -0.091      -0.053
log_Lagged_Cumulative_Avg_Positive     1.1149      0.036     30.857      0.000       1.044       1.186
log_Lagged_Cumulative_Avg_Negative    -1.0813      0.027    -40.192      0.000      -1.134      -1.029
log_Avg_Reviewer_Experience            0.0161      0.010      1.628      0.104      -0.003       0.035
log_Avg_Days_Since_Review             -0.0988      0.012     -7.943      0.000      -0.123      -0.074
==============================================================================
Omnibus:                      371.542   Durbin-Watson:                   0.608
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2798.078
Skew:                          -0.553   Prob(JB):                         0.00
Kurtosis:                       8.323   Cond. No.                         149.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# with fixed effect 
In [200]:
# Step 2: Spliting the data
leisure_data = regression_data[regression_data['Share_Leisure_Trip'] > 0.5]
business_data = regression_data[regression_data['Share_Business_Trip'] > 0.5]

#  Defining reusable function for fixed effect regression
def run_fixed_effects(data, title):
    
    panel_data = data.set_index(['Hotel_Name', 'Year_Week'])

   
    Y = panel_data['Cumulative_Avg_Score']
    X = panel_data[[
        'log_Cumulative_Reviews',
        'log_Lagged_Cumulative_Avg_Positive',
        'log_Lagged_Cumulative_Avg_Negative',
        'log_Avg_Reviewer_Experience',
        'log_Avg_Days_Since_Review'
    ]]

    
    X = sm.add_constant(X)

    #  PanelOLS with hotel and time fixed effects
    model = PanelOLS(Y, X, entity_effects=True, time_effects=True)
    results = model.fit()

   
    print(f"\n=== Fixed Effects Results for {title} ===\n")
    print(results.summary)

run_fixed_effects(leisure_data, "Leisure-Dominant Weeks")

run_fixed_effects(business_data, "Business-Dominant Weeks")
=== Fixed Effects Results for Leisure-Dominant Weeks ===

                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.2074
Estimator:                     PanelOLS   R-squared (Between):              0.2904
No. Observations:                 29956   R-squared (Within):               0.1870
Date:                  Thu, May 08 2025   R-squared (Overall):              0.3001
Time:                          13:01:41   Log-likelihood                 1.728e+04
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      1541.3
Entities:                           400   P-value                           0.0000
Avg Obs:                         74.890   Distribution:                 F(5,29448)
Min Obs:                         4.0000                                           
Max Obs:                         105.00   F-statistic (robust):             1541.3
                                          P-value                           0.0000
Time periods:                       104   Distribution:                 F(5,29448)
Avg Obs:                         288.04                                           
Min Obs:                         231.00                                           
Max Obs:                         445.00                                           
                                                                                  
                                         Parameter Estimates                                          
======================================================================================================
                                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------------------
const                                  7.9358     0.1089     72.880     0.0000      7.7224      8.1492
log_Cumulative_Reviews                 0.0446     0.0038     11.774     0.0000      0.0372      0.0520
log_Lagged_Cumulative_Avg_Positive     0.4185     0.0068     61.609     0.0000      0.4052      0.4319
log_Lagged_Cumulative_Avg_Negative    -0.3032     0.0047    -64.807     0.0000     -0.3124     -0.2941
log_Avg_Reviewer_Experience            0.0022     0.0012     1.8512     0.0641     -0.0001      0.0044
log_Avg_Days_Since_Review             -0.0090     0.0190    -0.4753     0.6345     -0.0463      0.0282
======================================================================================================

F-test for Poolability: 436.95
P-value: 0.0000
Distribution: F(502,29448)

Included effects: Entity, Time

=== Fixed Effects Results for Business-Dominant Weeks ===

                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.1508
Estimator:                     PanelOLS   R-squared (Between):              0.1685
No. Observations:                  2272   R-squared (Within):              -0.1562
Date:                  Thu, May 08 2025   R-squared (Overall):              0.1876
Time:                          13:01:42   Log-likelihood                    1103.7
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      64.344
Entities:                           353   P-value                           0.0000
Avg Obs:                         6.4363   Distribution:                  F(5,1811)
Min Obs:                         1.0000                                           
Max Obs:                         45.000   F-statistic (robust):             64.344
                                          P-value                           0.0000
Time periods:                       104   Distribution:                  F(5,1811)
Avg Obs:                         21.846                                           
Min Obs:                         5.0000                                           
Max Obs:                         41.000                                           
                                                                                  
                                         Parameter Estimates                                          
======================================================================================================
                                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------------------
const                                  7.9539     0.6492     12.251     0.0000      6.6806      9.2272
log_Cumulative_Reviews                 0.0856     0.0167     5.1137     0.0000      0.0528      0.1184
log_Lagged_Cumulative_Avg_Positive     0.3618     0.0276     13.106     0.0000      0.3076      0.4159
log_Lagged_Cumulative_Avg_Negative    -0.2434     0.0195    -12.506     0.0000     -0.2816     -0.2052
log_Avg_Reviewer_Experience           -0.0005     0.0040    -0.1370     0.8910     -0.0084      0.0073
log_Avg_Days_Since_Review             -0.0684     0.1149    -0.5957     0.5515     -0.2938      0.1569
======================================================================================================

F-test for Poolability: 35.650
P-value: 0.0000
Distribution: F(455,1811)

Included effects: Entity, Time
In [ ]:
# 4- In pooled OLS, both leisure and business travelers show that an increase in cumulative reviews lowers the average score, likely reflecting selection bias as more mixed reviews accumulate over time.
# However, in fixed effects, cumulative reviews turn positive for both groups, meaning that within a hotel, as it gathers more reviews, its perceived quality improves — especially noticeable for business travelers, where the effect is stronger.
# Across both models, positive word accumulation strongly boosts the average score, slightly more so for leisure travelers, suggesting emotional or experiential appreciation.
# Negative word accumulation consistently reduces scores in all cases, but this negative impact is more intense for leisure travelers than for business, indicating leisure guests are more sensitive to negative sentiment.
# Overall, while pooled OLS captures general reputation effects, fixed effects reveal that within a hotel, building up reviews and increasing positive sentiment meaningfully improves ratings, particularly for business stays.
In [ ]:
# now going for creating table for another group ( type of the people or the number of people)
In [209]:
#  Creating dummy variables for each group type
group_types = {
    'is_couple': 'Couple',
    'is_family_young': 'Family with young children',
    'is_family_old': 'Family with older children',
    'is_group': 'Group',
    'is_solo': 'Solo traveler',
    'is_friends': 'Travelers with friends'
}

for col_name, tag_value in group_types.items():
    clean_uk_hotels_df[col_name] = clean_uk_hotels_df['Clean_Tags'].apply(
        lambda tags: int(tag_value in [tag.strip() for tag in tags]) if isinstance(tags, list) else 0
    )
In [210]:
#  Extracting Year-Week
clean_uk_hotels_df['Review_Date'] = pd.to_datetime(clean_uk_hotels_df['Review_Date'])
clean_uk_hotels_df['Year_Week'] = clean_uk_hotels_df['Review_Date'].dt.strftime('%Y-%U')
In [211]:
#  Grouping and calculate shares
group_summary = clean_uk_hotels_df.groupby(['Hotel_Name', 'Year_Week']).agg({
    'is_couple': 'mean',
    'is_family_young': 'mean',
    'is_family_old': 'mean',
    'is_group': 'mean',
    'is_solo': 'mean',
    'is_friends': 'mean'
}).reset_index()
In [212]:
# Renaming columns
group_summary = group_summary.rename(columns={
    'is_couple': 'Share_Couple',
    'is_family_young': 'Share_Family_Young',
    'is_family_old': 'Share_Family_Old',
    'is_group': 'Share_Group',
    'is_solo': 'Share_Solo',
    'is_friends': 'Share_Friends'
})
In [213]:
#  Merging with enhanced weekly summary to create group-enhanced summary
group_enhanced_weekly_summary = enhanced_weekly_summary.merge(
    group_summary,
    on=['Hotel_Name', 'Year_Week'],
    how='left'
)

group_enhanced_weekly_summary.head()
Out[213]:
Hotel_Name Year_Week Reviews_This_Week Sum_Scores_This_Week Avg_Score_This_Week Cumulative_Reviews Cumulative_Sum_Scores Cumulative_Avg_Score Total_Positive_Words_This_Week Total_Negative_Words_This_Week ... Share_Business_Trip_y Share_Leisure_Trip_y Share_Business_Trip Share_Leisure_Trip Share_Couple Share_Family_Young Share_Family_Old Share_Group Share_Solo Share_Friends
0 11 Cadogan Gardens 2015-32 1 10.0 10.00 1 10.0 10.00 7 0 ... 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0
1 11 Cadogan Gardens 2015-35 1 10.0 10.00 2 20.0 10.00 51 0 ... 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0
2 11 Cadogan Gardens 2015-36 2 13.8 6.90 4 33.8 8.45 16 46 ... 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0
3 11 Cadogan Gardens 2015-37 1 9.2 9.20 5 43.0 8.60 2 15 ... 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
4 11 Cadogan Gardens 2015-38 2 17.9 8.95 7 60.9 8.70 56 14 ... 0.0 1.0 0.0 1.0 0.5 0.0 0.0 0.5 0.0 0.0

5 rows × 28 columns

In [214]:
print(group_enhanced_weekly_summary.columns.tolist())
['Hotel_Name', 'Year_Week', 'Reviews_This_Week', 'Sum_Scores_This_Week', 'Avg_Score_This_Week', 'Cumulative_Reviews', 'Cumulative_Sum_Scores', 'Cumulative_Avg_Score', 'Total_Positive_Words_This_Week', 'Total_Negative_Words_This_Week', 'Cumulative_Sum_Positive', 'Cumulative_Sum_Negative', 'Cumulative_Avg_Positive', 'Cumulative_Avg_Negative', 'Avg_Days_Since_Review', 'Avg_Reviewer_Experience', 'Share_Business_Trip_x', 'Share_Leisure_Trip_x', 'Share_Business_Trip_y', 'Share_Leisure_Trip_y', 'Share_Business_Trip', 'Share_Leisure_Trip', 'Share_Couple', 'Share_Family_Young', 'Share_Family_Old', 'Share_Group', 'Share_Solo', 'Share_Friends']
In [ ]:
# just couple . whithout fix effect , pure = 1 
In [217]:
group_enhanced_weekly_summary = group_enhanced_weekly_summary.copy()

# Adding log variables (add +1 to avoid log(0))
group_enhanced_weekly_summary['log_Cumulative_Reviews'] = np.log(group_enhanced_weekly_summary['Cumulative_Reviews'] + 1)
group_enhanced_weekly_summary['log_Lagged_Cumulative_Avg_Positive'] = np.log(group_enhanced_weekly_summary['Cumulative_Avg_Positive'] + 1)
group_enhanced_weekly_summary['log_Lagged_Cumulative_Avg_Negative'] = np.log(group_enhanced_weekly_summary['Cumulative_Avg_Negative'] + 1)
group_enhanced_weekly_summary['log_Avg_Reviewer_Experience'] = np.log(group_enhanced_weekly_summary['Avg_Reviewer_Experience'] + 1)
group_enhanced_weekly_summary['log_Avg_Days_Since_Review'] = np.log(group_enhanced_weekly_summary['Avg_Days_Since_Review'] + 1)
In [218]:
group_enhanced_weekly_summary
Out[218]:
Hotel_Name Year_Week Reviews_This_Week Sum_Scores_This_Week Avg_Score_This_Week Cumulative_Reviews Cumulative_Sum_Scores Cumulative_Avg_Score Total_Positive_Words_This_Week Total_Negative_Words_This_Week ... Share_Family_Young Share_Family_Old Share_Group Share_Solo Share_Friends log_Cumulative_Reviews log_Lagged_Cumulative_Avg_Positive log_Lagged_Cumulative_Avg_Negative log_Avg_Reviewer_Experience log_Avg_Days_Since_Review
0 11 Cadogan Gardens 2015-32 1 10.0 10.000000 1 10.0 10.000000 7 0 ... 0.000000 0.0 0.000 0.00 0.0 0.693147 2.079442 0.000000 4.718499 6.582025
1 11 Cadogan Gardens 2015-35 1 10.0 10.000000 2 20.0 10.000000 51 0 ... 0.000000 0.0 0.000 0.00 0.0 1.098612 3.401197 0.000000 1.609438 6.558198
2 11 Cadogan Gardens 2015-36 2 13.8 6.900000 4 33.8 8.450000 16 46 ... 0.000000 0.0 0.000 0.00 0.0 1.609438 2.970414 2.525729 2.251292 6.542472
3 11 Cadogan Gardens 2015-37 1 9.2 9.200000 5 43.0 8.600000 2 15 ... 0.000000 0.0 0.000 1.00 0.0 1.791759 2.785011 2.580217 2.197225 6.535241
4 11 Cadogan Gardens 2015-38 2 17.9 8.950000 7 60.9 8.700000 56 14 ... 0.000000 0.0 0.500 0.00 0.0 2.079442 2.988564 2.460809 1.252763 6.522093
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
36129 every hotel Piccadilly 2017-27 3 25.4 8.466667 553 4966.0 8.980108 35 37 ... 0.333333 0.0 0.000 0.00 0.0 6.317165 2.864729 2.624616 1.540445 3.412247
36130 every hotel Piccadilly 2017-28 1 10.0 10.000000 554 4976.0 8.981949 8 0 ... 0.000000 0.0 0.000 0.00 0.0 6.318968 2.863850 2.622941 1.791759 3.258097
36131 every hotel Piccadilly 2017-29 2 14.2 7.100000 556 4990.2 8.975180 22 34 ... 0.000000 0.0 1.000 0.00 0.0 6.322565 2.862715 2.624043 1.791759 2.772589
36132 every hotel Piccadilly 2017-30 4 30.9 7.725000 560 5021.1 8.966250 87 128 ... 0.000000 0.0 0.000 0.25 0.0 6.329721 2.864850 2.633942 1.098612 2.251292
36133 every hotel Piccadilly 2017-31 8 72.6 9.075000 568 5093.7 8.967782 69 57 ... 0.250000 0.0 0.125 0.00 0.0 6.343880 2.858472 2.628056 1.321756 0.810930

36134 rows × 33 columns

In [ ]:
# without fixed effect for Share_Couple=1
In [223]:
couple_pure_data = group_enhanced_weekly_summary[group_enhanced_weekly_summary['Share_Couple'] == 1.0].copy()

# Step 2: Define dependent and independent variables
Y = couple_pure_data['Cumulative_Avg_Score']

X = couple_pure_data[['log_Cumulative_Reviews',
                      'log_Lagged_Cumulative_Avg_Positive',
                      'log_Lagged_Cumulative_Avg_Negative',
                      'log_Avg_Reviewer_Experience',
                      'log_Avg_Days_Since_Review']]


X = sm.add_constant(X)

model = sm.OLS(Y, X).fit()

print(model.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.517
Model:                              OLS   Adj. R-squared:                  0.516
Method:                   Least Squares   F-statistic:                     984.9
Date:                  Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                          13:17:58   Log-Likelihood:                -2886.4
No. Observations:                  4612   AIC:                             5785.
Df Residuals:                      4606   BIC:                             5823.
Df Model:                             5                                         
Covariance Type:              nonrobust                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                  8.2203      0.109     75.351      0.000       8.006       8.434
log_Cumulative_Reviews                -0.0147      0.007     -2.136      0.033      -0.028      -0.001
log_Lagged_Cumulative_Avg_Positive     1.0056      0.027     36.974      0.000       0.952       1.059
log_Lagged_Cumulative_Avg_Negative    -0.8028      0.016    -49.803      0.000      -0.834      -0.771
log_Avg_Reviewer_Experience            0.0262      0.009      3.050      0.002       0.009       0.043
log_Avg_Days_Since_Review             -0.0362      0.008     -4.591      0.000      -0.052      -0.021
==============================================================================
Omnibus:                      589.870   Durbin-Watson:                   0.484
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2528.550
Skew:                          -0.565   Prob(JB):                         0.00
Kurtosis:                       6.447   Cond. No.                         141.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# fixed effect for couple =1
In [221]:
couple_pure_data = group_enhanced_weekly_summary[group_enhanced_weekly_summary['Share_Couple'] == 1.0].copy()

#  Converting 'Year_Week' to datetime for proper time index
couple_pure_data['Year_Week'] = pd.to_datetime(couple_pure_data['Year_Week'] + '-1', format='%Y-%W-%w')

#  Setting MultiIndex (Hotel_Name and Year_Week)
couple_pure_data = couple_pure_data.set_index(['Hotel_Name', 'Year_Week'])


couple_pure_data['log_Cumulative_Reviews'] = np.log1p(couple_pure_data['Cumulative_Reviews'])
couple_pure_data['log_Lagged_Cumulative_Avg_Positive'] = np.log1p(couple_pure_data['Cumulative_Avg_Positive'].shift(1).fillna(0))
couple_pure_data['log_Lagged_Cumulative_Avg_Negative'] = np.log1p(couple_pure_data['Cumulative_Avg_Negative'].shift(1).fillna(0))
couple_pure_data['log_Avg_Reviewer_Experience'] = np.log1p(couple_pure_data['Avg_Reviewer_Experience'])
couple_pure_data['log_Avg_Days_Since_Review'] = np.log1p(couple_pure_data['Avg_Days_Since_Review'])

Y = couple_pure_data['Cumulative_Avg_Score']
X = couple_pure_data[['log_Cumulative_Reviews',
                      'log_Lagged_Cumulative_Avg_Positive',
                      'log_Lagged_Cumulative_Avg_Negative',
                      'log_Avg_Reviewer_Experience',
                      'log_Avg_Days_Since_Review']]

X = sm.add_constant(X)

# Fixed Effects model (hotel + time)
model_fe = PanelOLS(Y, X, entity_effects=True, time_effects=True)
results_fe = model_fe.fit()

print(results_fe.summary)
                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.0265
Estimator:                     PanelOLS   R-squared (Between):              0.0445
No. Observations:                  4612   R-squared (Within):               0.0074
Date:                  Thu, May 08 2025   R-squared (Overall):              0.1041
Time:                          13:17:34   Log-likelihood                    414.12
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      22.607
Entities:                           357   P-value                           0.0000
Avg Obs:                         12.919   Distribution:                  F(5,4146)
Min Obs:                         1.0000                                           
Max Obs:                         50.000   F-statistic (robust):             22.607
                                          P-value                           0.0000
Time periods:                       105   Distribution:                  F(5,4146)
Avg Obs:                         43.924                                           
Min Obs:                         12.000                                           
Max Obs:                         136.00                                           
                                                                                  
                                         Parameter Estimates                                          
======================================================================================================
                                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------------------
const                                  9.2615     0.5929     15.620     0.0000      8.0990      10.424
log_Cumulative_Reviews                -0.0373     0.0161    -2.3215     0.0203     -0.0687     -0.0058
log_Lagged_Cumulative_Avg_Positive     0.0667     0.0209     3.1957     0.0014      0.0258      0.1076
log_Lagged_Cumulative_Avg_Negative    -0.1144     0.0113    -10.122     0.0000     -0.1366     -0.0923
log_Avg_Reviewer_Experience           -0.0023     0.0047    -0.4842     0.6282     -0.0116      0.0070
log_Avg_Days_Since_Review             -0.0558     0.1042    -0.5355     0.5923     -0.2601      0.1485
======================================================================================================

F-test for Poolability: 39.483
P-value: 0.0000
Distribution: F(460,4146)

Included effects: Entity, Time
In [ ]:
# 5-In the pooled model, couples’ ratings are highly sensitive to positive and negative sentiment, reviewer experience, and
# review recency. But once we control for hotel and time fixed effects, only the volume of reviews and word sentiment
# (positive/negative) remain significant — with positive word impact notably dropping and review freshness becoming 
# irrelevant. Fixed effects reveal that it’s not just the accumulation of reviews, but the evolving balance of positive
# and negative sentiment over time that shapes couple travelers' ratings within a hotel.
In [ ]:
# without fixed effect for solo tervaeller =1
In [105]:
solo_pure_data = group_enhanced_weekly_summary[group_enhanced_weekly_summary['Share_Solo'] == 1.0].copy()

solo_pure_data['log_Cumulative_Reviews'] = np.log1p(solo_pure_data['Cumulative_Reviews'])
solo_pure_data['log_Lagged_Cumulative_Avg_Positive'] = np.log1p(solo_pure_data['Cumulative_Avg_Positive'])
solo_pure_data['log_Lagged_Cumulative_Avg_Negative'] = np.log1p(solo_pure_data['Cumulative_Avg_Negative'])
solo_pure_data['log_Avg_Reviewer_Experience'] = np.log1p(solo_pure_data['Avg_Reviewer_Experience'])
solo_pure_data['log_Avg_Days_Since_Review'] = np.log1p(solo_pure_data['Avg_Days_Since_Review'])

Y = solo_pure_data['Cumulative_Avg_Score']
X = solo_pure_data[['log_Cumulative_Reviews',
                    'log_Lagged_Cumulative_Avg_Positive',
                    'log_Lagged_Cumulative_Avg_Negative',
                    'log_Avg_Reviewer_Experience',
                    'log_Avg_Days_Since_Review']]

X = sm.add_constant(X)

model = sm.OLS(Y, X)
results = model.fit()

print(results.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.470
Model:                              OLS   Adj. R-squared:                  0.469
Method:                   Least Squares   F-statistic:                     321.1
Date:                  Wed, 07 May 2025   Prob (F-statistic):          1.73e-246
Time:                          17:47:35   Log-Likelihood:                -1439.8
No. Observations:                  1816   AIC:                             2892.
Df Residuals:                      1810   BIC:                             2925.
Df Model:                             5                                         
Covariance Type:              nonrobust                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                  9.1226      0.192     47.518      0.000       8.746       9.499
log_Cumulative_Reviews                -0.0442      0.012     -3.561      0.000      -0.069      -0.020
log_Lagged_Cumulative_Avg_Positive     0.8195      0.044     18.763      0.000       0.734       0.905
log_Lagged_Cumulative_Avg_Negative    -0.8405      0.028    -30.145      0.000      -0.895      -0.786
log_Avg_Reviewer_Experience            0.0053      0.014      0.381      0.703      -0.022       0.032
log_Avg_Days_Since_Review             -0.0856      0.015     -5.563      0.000      -0.116      -0.055
==============================================================================
Omnibus:                      296.951   Durbin-Watson:                   0.624
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              878.555
Skew:                          -0.839   Prob(JB):                    1.68e-191
Kurtosis:                       5.966   Cond. No.                         130.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [106]:
solo_pure_data = group_enhanced_weekly_summary[group_enhanced_weekly_summary['Share_Solo'] == 1.0].copy()

# Create log-transformed variables
solo_pure_data['log_Cumulative_Reviews'] = np.log1p(solo_pure_data['Cumulative_Reviews'])
solo_pure_data['log_Lagged_Cumulative_Avg_Positive'] = np.log1p(solo_pure_data['Cumulative_Avg_Positive'])
solo_pure_data['log_Lagged_Cumulative_Avg_Negative'] = np.log1p(solo_pure_data['Cumulative_Avg_Negative'])

# Define dependent and independent variables
Y = solo_pure_data['Cumulative_Avg_Score']
X = solo_pure_data[['log_Cumulative_Reviews',
                    'log_Lagged_Cumulative_Avg_Positive',
                    'log_Lagged_Cumulative_Avg_Negative']]

X = sm.add_constant(X)

# Fit the model
model = sm.OLS(Y, X)
results = model.fit()

# Print the results
print(results.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.461
Model:                              OLS   Adj. R-squared:                  0.460
Method:                   Least Squares   F-statistic:                     516.6
Date:                  Wed, 07 May 2025   Prob (F-statistic):          1.50e-242
Time:                          17:47:38   Log-Likelihood:                -1455.2
No. Observations:                  1816   AIC:                             2918.
Df Residuals:                      1812   BIC:                             2940.
Df Model:                             3                                         
Covariance Type:              nonrobust                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                  8.4411      0.147     57.304      0.000       8.152       8.730
log_Cumulative_Reviews                -0.0101      0.011     -0.931      0.352      -0.031       0.011
log_Lagged_Cumulative_Avg_Positive     0.8458      0.044     19.336      0.000       0.760       0.932
log_Lagged_Cumulative_Avg_Negative    -0.8442      0.028    -30.045      0.000      -0.899      -0.789
==============================================================================
Omnibus:                      271.239   Durbin-Watson:                   0.615
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              818.197
Skew:                          -0.763   Prob(JB):                    2.14e-178
Kurtosis:                       5.913   Cond. No.                         72.3
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# with fixed effect for Solo =1
In [107]:
# Setting multi-index for panel data
solo_pure_data = group_enhanced_weekly_summary[group_enhanced_weekly_summary['Share_Solo'] == 1.0]
solo_pure_data = solo_pure_data.copy()  # avoid SettingWithCopyWarning
solo_pure_data['Year_Week'] = pd.to_datetime(solo_pure_data['Year_Week'] + '-1', format='%Y-%W-%w')
solo_pure_data = solo_pure_data.set_index(['Hotel_Name', 'Year_Week'])

solo_pure_data['log_Cumulative_Reviews'] = np.log1p(solo_pure_data['Cumulative_Reviews'])
solo_pure_data['log_Lagged_Cumulative_Avg_Positive'] = np.log1p(solo_pure_data['Cumulative_Avg_Positive'].shift(1))
solo_pure_data['log_Lagged_Cumulative_Avg_Negative'] = np.log1p(solo_pure_data['Cumulative_Avg_Negative'].shift(1))
solo_pure_data['log_Avg_Reviewer_Experience'] = np.log1p(solo_pure_data['Avg_Reviewer_Experience'])
solo_pure_data['log_Avg_Days_Since_Review'] = np.log1p(solo_pure_data['Avg_Days_Since_Review'])

# Droping missing values after lag
solo_pure_data = solo_pure_data.dropna()

Y = solo_pure_data['Cumulative_Avg_Score']
X = solo_pure_data[['log_Cumulative_Reviews',
                    'log_Lagged_Cumulative_Avg_Positive',
                    'log_Lagged_Cumulative_Avg_Negative',
                    'log_Avg_Reviewer_Experience',
                    'log_Avg_Days_Since_Review']]

X = sm.add_constant(X)

model_fe = PanelOLS(Y, X, entity_effects=True, time_effects=True)
results_fe = model_fe.fit()

print(results_fe.summary)
                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.0144
Estimator:                     PanelOLS   R-squared (Between):              0.0568
No. Observations:                  1815   R-squared (Within):              -0.0683
Date:                  Wed, May 07 2025   R-squared (Overall):              0.0466
Time:                          17:47:41   Log-likelihood                    194.88
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      4.0943
Entities:                           302   P-value                           0.0011
Avg Obs:                         6.0099   Distribution:                  F(5,1404)
Min Obs:                         1.0000                                           
Max Obs:                         34.000   F-statistic (robust):             4.0943
                                          P-value                           0.0011
Time periods:                       105   Distribution:                  F(5,1404)
Avg Obs:                         17.286                                           
Min Obs:                         5.0000                                           
Max Obs:                         86.000                                           
                                                                                  
                                         Parameter Estimates                                          
======================================================================================================
                                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------------------
const                                  8.5733     0.8948     9.5812     0.0000      6.8180      10.329
log_Cumulative_Reviews                -0.0807     0.0245    -3.3003     0.0010     -0.1287     -0.0327
log_Lagged_Cumulative_Avg_Positive     0.0838     0.0319     2.6291     0.0087      0.0213      0.1464
log_Lagged_Cumulative_Avg_Negative     0.0060     0.0183     0.3279     0.7430     -0.0299      0.0418
log_Avg_Reviewer_Experience           -0.0120     0.0073    -1.6332     0.1026     -0.0264      0.0024
log_Avg_Days_Since_Review             -0.0064     0.1568    -0.0407     0.9675     -0.3140      0.3013
======================================================================================================

F-test for Poolability: 24.281
P-value: 0.0000
Distribution: F(405,1404)

Included effects: Entity, Time
In [108]:
# Filtering data for solo travelers only (100% solo share)
solo_pure_data = group_enhanced_weekly_summary[group_enhanced_weekly_summary['Share_Solo'] == 1.0].copy()

# Converting Year_Week to datetime and setting panel index
solo_pure_data['Year_Week'] = pd.to_datetime(solo_pure_data['Year_Week'] + '-1', format='%Y-%W-%w')
solo_pure_data = solo_pure_data.set_index(['Hotel_Name', 'Year_Week'])

# Creating log-transformed variables
solo_pure_data['log_Cumulative_Reviews'] = np.log1p(solo_pure_data['Cumulative_Reviews'])
solo_pure_data['log_Lagged_Cumulative_Avg_Positive'] = np.log1p(solo_pure_data['Cumulative_Avg_Positive'].shift(1))
solo_pure_data['log_Lagged_Cumulative_Avg_Negative'] = np.log1p(solo_pure_data['Cumulative_Avg_Negative'].shift(1))

# Dropping missing values after lagging
solo_pure_data = solo_pure_data.dropna()

# Defining dependent and independent variables
Y = solo_pure_data['Cumulative_Avg_Score']
X = solo_pure_data[['log_Cumulative_Reviews',
                    'log_Lagged_Cumulative_Avg_Positive',
                    'log_Lagged_Cumulative_Avg_Negative']]

X = sm.add_constant(X)

# Fitting fixed effects model with hotel and time effects
model_fe = PanelOLS(Y, X, entity_effects=True, time_effects=True)
results_fe = model_fe.fit()


print(results_fe.summary)
                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.0125
Estimator:                     PanelOLS   R-squared (Between):              0.0562
No. Observations:                  1815   R-squared (Within):              -0.0830
Date:                  Wed, May 07 2025   R-squared (Overall):              0.0459
Time:                          17:47:44   Log-likelihood                    193.16
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      5.9317
Entities:                           302   P-value                           0.0005
Avg Obs:                         6.0099   Distribution:                  F(3,1406)
Min Obs:                         1.0000                                           
Max Obs:                         34.000   F-statistic (robust):             5.9317
                                          P-value                           0.0005
Time periods:                       105   Distribution:                  F(3,1406)
Avg Obs:                         17.286                                           
Min Obs:                         5.0000                                           
Max Obs:                         86.000                                           
                                                                                  
                                         Parameter Estimates                                          
======================================================================================================
                                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------------------
const                                  8.5312     0.1466     58.194     0.0000      8.2436      8.8188
log_Cumulative_Reviews                -0.0827     0.0244    -3.3853     0.0007     -0.1306     -0.0348
log_Lagged_Cumulative_Avg_Positive     0.0818     0.0319     2.5683     0.0103      0.0193      0.1443
log_Lagged_Cumulative_Avg_Negative     0.0054     0.0183     0.2957     0.7675     -0.0304      0.0413
======================================================================================================

F-test for Poolability: 24.948
P-value: 0.0000
Distribution: F(405,1406)

Included effects: Entity, Time
In [ ]:
# 6-For solo travelers, both models show that more cumulative reviews decrease hotel scores, but the decline is sharper in
# the fixed effects model, revealing a stronger within-hotel dynamic over time. Positive sentiment is a major driver in 
# pooled OLS, yet its influence weakens under fixed effects, indicating much of the effect is between hotels rather than 
# within. Notably, negative sentiment strongly lowers scores in pooled OLS but loses significance in fixed effects, 
# implying external perceptions dominate rather than internal shifts. Variables like reviewer experience and days since 
# review matter in pooled OLS but fade in fixed effects as hotel and time-specific factors absorb their variation.
# Overall, pooled OLS captures external reputation effects across hotels, while fixed effects isolate the internal 
# erosion of scores as reviews accumulate.
In [109]:
# Checking for NaNs or infinite values in X and Y
print(X.isna().sum())
print(np.isinf(X).sum())
print(Y.isna().sum())
print(np.isinf(Y).sum())
const                                 0
log_Cumulative_Reviews                0
log_Lagged_Cumulative_Avg_Positive    0
log_Lagged_Cumulative_Avg_Negative    0
dtype: int64
const                                 0
log_Cumulative_Reviews                0
log_Lagged_Cumulative_Avg_Positive    0
log_Lagged_Cumulative_Avg_Negative    0
dtype: int64
0
0
In [110]:
# Droping rows with any NaNs (safe, only first lag row)
data_for_model = pd.concat([X, Y], axis=1).dropna()

# Separating cleaned X and Y
Y = data_for_model['Cumulative_Avg_Score']
X = data_for_model.drop(columns=['Cumulative_Avg_Score'])
In [ ]:
# Now using interaction term for solo instead of splitting method on without fix effect 
In [111]:
group_column = 'Share_Solo'
pure_group_data = group_enhanced_weekly_summary[group_enhanced_weekly_summary[group_column] == 1.0].copy()

#  Creating log variables 
epsilon = 1e-6
pure_group_data['log_Cumulative_Reviews'] = (pure_group_data['Cumulative_Reviews'] + epsilon).apply(np.log)
pure_group_data['log_Lagged_Cumulative_Avg_Positive'] = (pure_group_data['Cumulative_Avg_Positive'].shift(1) + epsilon).apply(np.log)
pure_group_data['log_Lagged_Cumulative_Avg_Negative'] = (pure_group_data['Cumulative_Avg_Negative'].shift(1) + epsilon).apply(np.log)
pure_group_data['log_Avg_Reviewer_Experience'] = (pure_group_data['Avg_Reviewer_Experience'] + epsilon).apply(np.log)
pure_group_data['log_Avg_Days_Since_Review'] = (pure_group_data['Avg_Days_Since_Review'] + epsilon).apply(np.log)


pure_group_data['interaction_CumReviews_Solo'] = pure_group_data['log_Cumulative_Reviews']
pure_group_data['interaction_Positive_Solo'] = pure_group_data['log_Lagged_Cumulative_Avg_Positive']
pure_group_data['interaction_Negative_Solo'] = pure_group_data['log_Lagged_Cumulative_Avg_Negative']
pure_group_data['interaction_Experience_Solo'] = pure_group_data['log_Avg_Reviewer_Experience']
pure_group_data['interaction_DaysSinceReview_Solo'] = pure_group_data['log_Avg_Days_Since_Review']

#  Preparing X and Y
Y = pure_group_data['Cumulative_Avg_Score']
X = pure_group_data[[
    'log_Cumulative_Reviews',
    'log_Lagged_Cumulative_Avg_Positive',
    'log_Lagged_Cumulative_Avg_Negative',
    'log_Avg_Reviewer_Experience',
    'log_Avg_Days_Since_Review',
    'interaction_CumReviews_Solo',
    'interaction_Positive_Solo',
    'interaction_Negative_Solo',
    'interaction_Experience_Solo',
    'interaction_DaysSinceReview_Solo'
]]

X = sm.add_constant(X)

data_for_model = pd.concat([X, Y], axis=1).dropna()
Y_clean = data_for_model['Cumulative_Avg_Score']
X_clean = data_for_model.drop(columns=['Cumulative_Avg_Score'])

model = sm.OLS(Y_clean, X_clean).fit()

print(model.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.154
Model:                              OLS   Adj. R-squared:                  0.152
Method:                   Least Squares   F-statistic:                     65.85
Date:                  Wed, 07 May 2025   Prob (F-statistic):           2.57e-63
Time:                          17:47:52   Log-Likelihood:                -1864.0
No. Observations:                  1815   AIC:                             3740.
Df Residuals:                      1809   BIC:                             3773.
Df Model:                             5                                         
Covariance Type:              nonrobust                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                  8.0805      0.210     38.450      0.000       7.668       8.493
log_Cumulative_Reviews                -0.0653      0.007     -9.346      0.000      -0.079      -0.052
log_Lagged_Cumulative_Avg_Positive     0.2993      0.025     11.801      0.000       0.250       0.349
log_Lagged_Cumulative_Avg_Negative    -0.0370      0.005     -7.376      0.000      -0.047      -0.027
log_Avg_Reviewer_Experience            0.0034      0.007      0.477      0.633      -0.010       0.017
log_Avg_Days_Since_Review             -0.0507      0.009     -5.456      0.000      -0.069      -0.032
interaction_CumReviews_Solo           -0.0653      0.007     -9.346      0.000      -0.079      -0.052
interaction_Positive_Solo              0.2993      0.025     11.801      0.000       0.250       0.349
interaction_Negative_Solo             -0.0370      0.005     -7.376      0.000      -0.047      -0.027
interaction_Experience_Solo            0.0034      0.007      0.477      0.633      -0.010       0.017
interaction_DaysSinceReview_Solo      -0.0507      0.009     -5.456      0.000      -0.069      -0.032
==============================================================================
Omnibus:                      151.830   Durbin-Watson:                   0.656
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              197.792
Skew:                          -0.713   Prob(JB):                     1.12e-43
Kurtosis:                       3.761   Cond. No.                     3.98e+17
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.55e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [ ]:
# interaction for solo with fixed effect 
In [112]:
interaction_data = group_enhanced_weekly_summary.copy()

interaction_data['Year_Week'] = pd.to_datetime(interaction_data['Year_Week'] + '-1', format='%Y-%W-%w')


interaction_data['log_Cumulative_Reviews'] = np.log1p(interaction_data['Cumulative_Reviews'])
interaction_data['log_Lagged_Cumulative_Avg_Positive'] = np.log1p(interaction_data['Cumulative_Avg_Positive'])
interaction_data['log_Lagged_Cumulative_Avg_Negative'] = np.log1p(interaction_data['Cumulative_Avg_Negative'])
interaction_data['log_Avg_Reviewer_Experience'] = np.log1p(interaction_data['Avg_Reviewer_Experience'])
interaction_data['log_Avg_Days_Since_Review'] = np.log1p(interaction_data['Avg_Days_Since_Review'])

#  Interaction terms for Solo
interaction_data['interaction_CumReviews_Solo'] = interaction_data['log_Cumulative_Reviews'] * interaction_data['Share_Solo']
interaction_data['interaction_Positive_Solo'] = interaction_data['log_Lagged_Cumulative_Avg_Positive'] * interaction_data['Share_Solo']
interaction_data['interaction_Negative_Solo'] = interaction_data['log_Lagged_Cumulative_Avg_Negative'] * interaction_data['Share_Solo']
interaction_data['interaction_Experience_Solo'] = interaction_data['log_Avg_Reviewer_Experience'] * interaction_data['Share_Solo']
interaction_data['interaction_DaysSinceReview_Solo'] = interaction_data['log_Avg_Days_Since_Review'] * interaction_data['Share_Solo']

# Setting index for panel data
interaction_data = interaction_data.set_index(['Hotel_Name', 'Year_Week'])

#  Defining dependent and independent variables
Y = interaction_data['Cumulative_Avg_Score']

X = interaction_data[[
    'log_Cumulative_Reviews',
    'log_Lagged_Cumulative_Avg_Positive',
    'log_Lagged_Cumulative_Avg_Negative',
    'log_Avg_Reviewer_Experience',
    'log_Avg_Days_Since_Review',
    'interaction_CumReviews_Solo',
    'interaction_Positive_Solo',
    'interaction_Negative_Solo',
    'interaction_Experience_Solo',
    'interaction_DaysSinceReview_Solo'
]]

X = sm.add_constant(X)

model_fe = PanelOLS(Y, X, entity_effects=True, time_effects=True)
results_fe = model_fe.fit()

print(results_fe.summary)
                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.2871
Estimator:                     PanelOLS   R-squared (Between):              0.3574
No. Observations:                 36134   R-squared (Within):               0.2674
Date:                  Wed, May 07 2025   R-squared (Overall):              0.3687
Time:                          17:47:56   Log-likelihood                 1.481e+04
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      1434.3
Entities:                           400   P-value                           0.0000
Avg Obs:                         90.335   Distribution:                F(10,35620)
Min Obs:                        10.0000                                           
Max Obs:                         106.00   F-statistic (robust):             1434.3
                                          P-value                           0.0000
Time periods:                       105   Distribution:                F(10,35620)
Avg Obs:                         344.13                                           
Min Obs:                         299.00                                           
Max Obs:                         496.00                                           
                                                                                  
                                         Parameter Estimates                                          
======================================================================================================
                                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------------------
const                                  7.9179     0.1732     45.711     0.0000      7.5783      8.2574
log_Cumulative_Reviews                 0.0546     0.0037     14.562     0.0000      0.0473      0.0620
log_Lagged_Cumulative_Avg_Positive     0.5276     0.0077     68.274     0.0000      0.5124      0.5427
log_Lagged_Cumulative_Avg_Negative    -0.4390     0.0048    -91.376     0.0000     -0.4484     -0.4296
log_Avg_Reviewer_Experience            0.0061     0.0019     3.2286     0.0012      0.0024      0.0097
log_Avg_Days_Since_Review             -0.0057     0.0307    -0.1870     0.8517     -0.0659      0.0544
interaction_CumReviews_Solo            0.0006     0.0031     0.1873     0.8514     -0.0055      0.0067
interaction_Positive_Solo              0.0034     0.0089     0.3852     0.7001     -0.0140      0.0208
interaction_Negative_Solo              0.0114     0.0077     1.4764     0.1398     -0.0037      0.0266
interaction_Experience_Solo           -0.0155     0.0043    -3.6144     0.0003     -0.0238     -0.0071
interaction_DaysSinceReview_Solo      -0.0043     0.0033    -1.3072     0.1911     -0.0106      0.0021
======================================================================================================

F-test for Poolability: 380.77
P-value: 0.0000
Distribution: F(503,35620)

Included effects: Entity, Time
In [ ]:
# now without fix effect for goup 'Family with Young Children'=1
In [ ]:
#  Filter pure Family with Young Children data
family_young_pure_data = group_enhanced_weekly_summary[group_enhanced_weekly_summary['Share_Family_Young'] == 1.0]

Y = family_young_pure_data['Cumulative_Avg_Score']


X = family_young_pure_data[[
    'log_Cumulative_Reviews',
    'log_Lagged_Cumulative_Avg_Positive',
    'log_Lagged_Cumulative_Avg_Negative',
    'log_Avg_Reviewer_Experience',
    'log_Avg_Days_Since_Review'
]]


X = sm.add_constant(X)


model = sm.OLS(Y, X).fit()

print(model.summary())
In [ ]:
# Same with fixxed effect 
In [115]:
family_young_pure_data = group_enhanced_weekly_summary[group_enhanced_weekly_summary['Share_Family_Young'] == 1.0].copy()

#  Preparing panel index 
if not np.issubdtype(family_young_pure_data['Year_Week'].dtype, np.datetime64):
    family_young_pure_data['Year_Week'] = pd.to_datetime(family_young_pure_data['Year_Week'].astype(str) + '-1', format='%Y-%W-%w')

family_young_pure_data = family_young_pure_data.set_index(['Hotel_Name', 'Year_Week'])

Y = family_young_pure_data['Cumulative_Avg_Score']
X = family_young_pure_data[['log_Cumulative_Reviews',
                            'log_Lagged_Cumulative_Avg_Positive',
                            'log_Lagged_Cumulative_Avg_Negative',
                            'log_Avg_Reviewer_Experience',
                            'log_Avg_Days_Since_Review']]


X = sm.add_constant(X)
model_fe = PanelOLS(Y, X, entity_effects=True, time_effects=True)
results_fe = model_fe.fit()

print(results_fe.summary)
                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.3531
Estimator:                     PanelOLS   R-squared (Between):              0.3396
No. Observations:                   722   R-squared (Within):               0.1055
Date:                  Wed, May 07 2025   R-squared (Overall):              0.3633
Time:                          17:48:08   Log-likelihood                    324.90
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      42.133
Entities:                           229   P-value                           0.0000
Avg Obs:                         3.1528   Distribution:                   F(5,386)
Min Obs:                         1.0000                                           
Max Obs:                         15.000   F-statistic (robust):             42.133
                                          P-value                           0.0000
Time periods:                       103   Distribution:                   F(5,386)
Avg Obs:                         7.0097                                           
Min Obs:                         1.0000                                           
Max Obs:                         22.000                                           
                                                                                  
                                         Parameter Estimates                                          
======================================================================================================
                                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------------------
const                                  10.019     0.9426     10.629     0.0000      8.1657      11.872
log_Cumulative_Reviews                -0.0784     0.0518    -1.5137     0.1309     -0.1803      0.0234
log_Lagged_Cumulative_Avg_Positive     0.1843     0.0675     2.7288     0.0066      0.0515      0.3171
log_Lagged_Cumulative_Avg_Negative    -0.4945     0.0346    -14.290     0.0000     -0.5625     -0.4264
log_Avg_Reviewer_Experience            0.0021     0.0138     0.1517     0.8795     -0.0250      0.0291
log_Avg_Days_Since_Review             -0.0483     0.1771    -0.2726     0.7853     -0.3964      0.2999
======================================================================================================

F-test for Poolability: 10.033
P-value: 0.0000
Distribution: F(330,386)

Included effects: Entity, Time
In [ ]:
# 7-In the pooled OLS model, cumulative reviews significantly reduce scores, suggesting visible negative reputation effects
# as reviews accumulate across hotels. Positive and negative word sentiment are both strong and significant, showing 
# clear influence on hotel ratings. In fixed effects, cumulative reviews lose significance, indicating that within-hotel
# changes in review quantity are less impactful when controlling for hotel-specific factors. Positive and especially 
# negative words remain important but with reduced magnitude, meaning review content still shapes scores over time. 
# Overall, while pooled OLS captures broader reputation trends, fixed effects reveal that sentiment, not sheer volume, 
# drives score variation within hotels.
In [ ]:
# Now without fix effect for 'pure Family with Older Children ' pure =1
In [116]:
family_old_pure_data = group_enhanced_weekly_summary[group_enhanced_weekly_summary['Share_Family_Old'] == 1.0].copy()

Y = family_old_pure_data['Cumulative_Avg_Score']
X = family_old_pure_data[['log_Cumulative_Reviews',
                          'log_Lagged_Cumulative_Avg_Positive',
                          'log_Lagged_Cumulative_Avg_Negative',
                          'log_Avg_Reviewer_Experience',
                          'log_Avg_Days_Since_Review']]

# without fixed effects
X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()

print(model.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                       0.473
Model:                              OLS   Adj. R-squared:                  0.464
Method:                   Least Squares   F-statistic:                     53.19
Date:                  Wed, 07 May 2025   Prob (F-statistic):           2.82e-39
Time:                          17:48:12   Log-Likelihood:                -241.42
No. Observations:                   302   AIC:                             494.8
Df Residuals:                       296   BIC:                             517.1
Df Model:                             5                                         
Covariance Type:              nonrobust                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                 12.1869      1.216     10.025      0.000       9.795      14.579
log_Cumulative_Reviews                -0.1067      0.039     -2.732      0.007      -0.183      -0.030
log_Lagged_Cumulative_Avg_Positive     0.7396      0.100      7.425      0.000       0.544       0.936
log_Lagged_Cumulative_Avg_Negative    -0.7093      0.062    -11.369      0.000      -0.832      -0.587
log_Avg_Reviewer_Experience            0.0094      0.038      0.248      0.804      -0.065       0.084
log_Avg_Days_Since_Review             -0.5430      0.172     -3.160      0.002      -0.881      -0.205
==============================================================================
Omnibus:                       32.366   Durbin-Watson:                   1.323
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               48.203
Skew:                          -0.690   Prob(JB):                     3.41e-11
Kurtosis:                       4.388   Cond. No.                         330.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [117]:
family_old_pure_data = group_enhanced_weekly_summary[group_enhanced_weekly_summary['Share_Family_Old'] == 1.0].copy()

# Converting 'Year_Week' to datetime and set multi-index
family_old_pure_data = family_old_pure_data.reset_index(drop=True)
family_old_pure_data['Year_Week'] = pd.to_datetime(family_old_pure_data['Year_Week'].astype(str) + '-1', format='%Y-%W-%w')
family_old_pure_data = family_old_pure_data.set_index(['Hotel_Name', 'Year_Week'])

#  Defining dependent and independent variables
Y = family_old_pure_data['Cumulative_Avg_Score']
X = family_old_pure_data[['log_Cumulative_Reviews',
                          'log_Lagged_Cumulative_Avg_Positive',
                          'log_Lagged_Cumulative_Avg_Negative',
                          'log_Avg_Reviewer_Experience',
                          'log_Avg_Days_Since_Review']]

X = sm.add_constant(X)

# Hotel and Time effects
model_fe = PanelOLS(Y, X, entity_effects=True, time_effects=True)
results_fe = model_fe.fit()

print(results_fe.summary)
                           PanelOLS Estimation Summary                            
==================================================================================
Dep. Variable:     Cumulative_Avg_Score   R-squared:                        0.3149
Estimator:                     PanelOLS   R-squared (Between):             -0.0059
No. Observations:                   302   R-squared (Within):              -0.9954
Date:                  Wed, May 07 2025   R-squared (Overall):             -0.0442
Time:                          17:48:15   Log-likelihood                    116.94
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      6.9881
Entities:                           160   P-value                           0.0000
Avg Obs:                         1.8875   Distribution:                    F(5,76)
Min Obs:                         1.0000                                           
Max Obs:                         8.0000   F-statistic (robust):             6.9881
                                          P-value                           0.0000
Time periods:                        62   Distribution:                    F(5,76)
Avg Obs:                         4.8710                                           
Min Obs:                         1.0000                                           
Max Obs:                         16.000                                           
                                                                                  
                                         Parameter Estimates                                          
======================================================================================================
                                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------------------
const                                 -1.7217     52.872    -0.0326     0.9741     -107.02      103.58
log_Cumulative_Reviews                 0.0160     0.1923     0.0833     0.9338     -0.3669      0.3990
log_Lagged_Cumulative_Avg_Positive     0.3171     0.1281     2.4754     0.0155      0.0620      0.5722
log_Lagged_Cumulative_Avg_Negative    -0.5272     0.1106    -4.7646     0.0000     -0.7476     -0.3068
log_Avg_Reviewer_Experience           -0.0435     0.0430    -1.0105     0.3155     -0.1291      0.0422
log_Avg_Days_Since_Review              1.7393     8.4583     0.2056     0.8376     -15.107      18.586
======================================================================================================

F-test for Poolability: 3.3619
P-value: 0.0000
Distribution: F(220,76)

Included effects: Entity, Time
In [ ]:
# 8- In the pooled regression (Table 21), cumulative reviews have a strong negative impact (-0.1067, p = 0.007), but this
# effect disappears in the fixed effect model (Table 22), where it becomes non-significant (0.0160, p = 0.933). This 
# suggests that the apparent influence of review count in the pooled model was largely driven by differences across
# hotels rather than true within-hotel dynamics over time.

# Both models agree that positive and negative words are meaningful predictors: positive sentiment consistently increases
# average scores (Pooled: 0.7396, Fixed: 0.3171) and negative sentiment lowers them (Pooled: -0.7093, Fixed: -0.5272),
# with both effects significant across specifications. Although the magnitudes differ (effects shrink in fixed effects), 
# the direction and significance remain robust, underlining the importance of sentiment in shaping hotel ratings.

# Reviewer experience and days since review are not significant in the fixed model, while in the pooled model, only days 
# since review showed a significant negative impact (-0.5430, p = 0.002), which disappears with fixed effects. This
# suggests that temporal freshness mattered across hotels but not within the same hotel over time.
In [ ]:
# Overall models (General population):
# In the fixed effect specification, we found that cumulative number of reviews had a positive and significant effect on 
# cumulative average score — indicating that, over time, as hotels gather more reviews, their average scores tend to 
# improve slightly (Table 9–10).
# However, this effect was not very large, and interestingly, positive words (positive sentiment) strongly and 
# significantly increased average scores, while negative words (negative sentiment) had a strong decreasing effect. 
# Other controls like reviewer experience and review recency were mostly insignificant.

# Segmenting by purpose of travel (Leisure vs. Business):
# When we looked specifically at leisure travelers (Table 14 fixed effects), the effect of cumulative reviews was
# positive and significant, similar to the general pattern, but the effect of positive words was very strong, and 
# negative words had a strong negative effect.
# For business travelers, the fixed effects model showed an even stronger role of cumulative reviews
# (coefficient: +0.0856) indicating that business travelers were more sensitive to the accumulation of reviews over time,
# perhaps valuing reliability. However, while positive and negative sentiment still mattered, their magnitude was smaller
# compared to leisure travelers.
# Key insight: Leisure travelers cared strongly about sentiment (positive/negative words), while business travelers were
#     somewhat more influenced by the accumulated volume of reviews.

# Segmenting by group type (Couples, Solo, Families):

# Couple travelers (Table 16): Fixed effect results showed cumulative reviews had a significant negative effect 
#     (-0.0373), suggesting score dilution over time, while positive sentiment had a positive effect, and negative 
#     sentiment had a clear negative impact.

# Solo travelers (Table 18): Fixed effects revealed cumulative reviews were significantly negative (-0.0807), 
#     showing solo travelers may be particularly sensitive to the dilution effect as reviews accumulate. Surprisingly, 
#     the effect of positive words weakened, and negative words were not significant, implying solo travelers may be less
#     driven by review sentiment over time.

# Families with young children (Table 20): Cumulative reviews had a negative but not statistically significant effect,
#     while positive words still had a significant positive impact, and negative words had a strong negative impact.

# Families with older children (Table 22): Fixed effects results showed no significant effect for cumulative reviews,
#     but positive words remained significantly positive, and negative words significantly negative. However, other
#     variables were not meaningful, likely due to a smaller sample size.
In [ ]:
# Across all fixed effect models, positive words consistently emerged as the strongest driver of higher hotel scores,
# especially pronounced for leisure travelers and families with older children, showing their emotional responsiveness 
# to positive sentiments. Negative words uniformly decreased scores, but the effect was especially severe for families 
# with young children and couples, indicating their heightened sensitivity to negative experiences. The accumulation of 
# reviews had varying impacts: while business travelers benefited most from accumulating reviews — perhaps valuing 
#     reliability and consistency — solo travelers and couples saw negative effects, suggesting potential review fatigue
#     or growing criticism over time in these segments. Families with older children showed no significant sensitivity to
#     review accumulation, indicating other factors shape their perceptions. Notably, solo travelers were least
#     responsive to both positive and negative sentiments in fixed effects, implying they might prioritize different
#     criteria, like convenience or price. In contrast, leisure travelers were highly responsive to sentiment, making 
#     them the most emotionally driven segment. Overall, fixed effects models provided clearer, more credible patterns 
#     than pooled OLS, which tended to exaggerate correlations due to unobserved hotel differences. This highlights the
#     power of fixed effects in uncovering genuine within-hotel dynamics across different traveler profiles.
In [ ]:
# Methodological Reflection: Fixed Effects vs. Pooled OLS
# Fixed effects models allowed us to control for unobserved, time-invariant heterogeneity across hotels — things like 
# hotel location, star rating, size, management style. By focusing on within-hotel variation, we gained more reliable
# insights about how changes over time in variables like cumulative reviews and sentiment impacted scores.

# In contrast, pooled OLS models captured both within- and between-hotel variation. While they showed strong significance
# for many variables, they were biased by hotel-level factors. For instance, better hotels naturally have more positive
# reviews and higher scores, which inflated coefficients in pooled models.

# Importantly, in pooled OLS, cumulative reviews often appeared as a strong negative driver, but in fixed effects, the 
# story became more nuanced: for some groups (like business travelers), the accumulation was positive; for others 
#     (solo, couples), it was negative, indicating group-specific temporal dynamics that pooled models masked.

# Conclusion: Fixed effects give us a clearer, causally reliable picture of how scores evolve within hotels over time, 
#     especially for sentiment effects, while pooled OLS risks overstating effects due to hotel-level differences.
In [118]:
enhanced_weekly_summary.head()
Out[118]:
Hotel_Name Year_Week Reviews_This_Week Sum_Scores_This_Week Avg_Score_This_Week Cumulative_Reviews Cumulative_Sum_Scores Cumulative_Avg_Score Total_Positive_Words_This_Week Total_Negative_Words_This_Week Cumulative_Sum_Positive Cumulative_Sum_Negative Cumulative_Avg_Positive Cumulative_Avg_Negative Avg_Days_Since_Review Avg_Reviewer_Experience Share_Business_Trip Share_Leisure_Trip
0 11 Cadogan Gardens 2015-32 1 10.0 10.00 1 10.0 10.00 7 0 7 0 7.000000 0.000000 721.0 111.0 0.0 1.0
1 11 Cadogan Gardens 2015-35 1 10.0 10.00 2 20.0 10.00 51 0 58 0 29.000000 0.000000 704.0 4.0 0.0 1.0
2 11 Cadogan Gardens 2015-36 2 13.8 6.90 4 33.8 8.45 16 46 74 46 18.500000 11.500000 693.0 8.5 0.0 1.0
3 11 Cadogan Gardens 2015-37 1 9.2 9.20 5 43.0 8.60 2 15 76 61 15.200000 12.200000 688.0 8.0 1.0 0.0
4 11 Cadogan Gardens 2015-38 2 17.9 8.95 7 60.9 8.70 56 14 132 75 18.857143 10.714286 679.0 2.5 0.0 1.0
In [121]:
enhanced_weekly_summary['Year_Month'] = pd.to_datetime(
    enhanced_weekly_summary['Year_Week'].astype(str) + '-1',
    format='%Y-%U-%w'
).dt.to_period('M').dt.to_timestamp()


hotel_review_counts = (
    enhanced_weekly_summary.groupby('Hotel_Name')['Cumulative_Reviews']
    .max()
    .sort_values(ascending=False)
)


middle_start = len(hotel_review_counts) // 2 - 5
middle_10_hotels = hotel_review_counts.iloc[middle_start:middle_start + 10].index.tolist()


middle_hotels_data = enhanced_weekly_summary[
    enhanced_weekly_summary['Hotel_Name'].isin(middle_10_hotels)
]

# Step 5: Group data by hotel and month
monthly_reviews_middle = (
    middle_hotels_data.groupby(['Year_Month', 'Hotel_Name'])['Cumulative_Reviews']
    .sum()
    .reset_index()
)


plt.figure(figsize=(14, 7))

for hotel in middle_10_hotels:
    hotel_data = monthly_reviews_middle[monthly_reviews_middle['Hotel_Name'] == hotel]
    plt.plot(hotel_data['Year_Month'], hotel_data['Cumulative_Reviews'], label=hotel)


for year in enhanced_weekly_summary['Year_Month'].dt.year.unique():
    plt.axvline(pd.Timestamp(f'{year}-12-15'), color='red', linestyle='--', alpha=0.5,
                label='Christmas/New Year' if year == enhanced_weekly_summary['Year_Month'].dt.year.unique()[0] else "")
    plt.axvline(pd.Timestamp(f'{year}-07-01'), color='green', linestyle='--', alpha=0.5,
                label='Summer Peak' if year == enhanced_weekly_summary['Year_Month'].dt.year.unique()[0] else "")


plt.title('Monthly Number of Reviews for Middle 10 Hotels (with Seasonality Markers)')
plt.xlabel('Month')
plt.ylabel('Number of Reviews')
plt.legend(loc='upper right', fontsize='small')
plt.grid(True)
plt.tight_layout()

# Save plot as PNG
plt.savefig("middle_10_hotels_monthly_reviews.png", dpi=300)
plt.show()
In [137]:
enhanced_weekly_summary['Year_Month'] = enhanced_weekly_summary['Year_Week'].dt.to_period('M').dt.to_timestamp()


hotel_review_counts = (
    enhanced_weekly_summary.groupby('Hotel_Name')['Cumulative_Reviews']
    .max()
    .sort_values(ascending=False)
)


middle_start = len(hotel_review_counts) // 2 - 5
middle_10_hotels = hotel_review_counts.iloc[middle_start:middle_start + 10].index.tolist()


middle_hotels_data = enhanced_weekly_summary[enhanced_weekly_summary['Hotel_Name'].isin(middle_10_hotels)]


monthly_reviews_middle = (
    middle_hotels_data.groupby(['Year_Month', 'Hotel_Name'])['Cumulative_Reviews']
    .sum()
    .reset_index()
)


plt.figure(figsize=(14, 7))

for hotel in middle_10_hotels:
    hotel_data = monthly_reviews_middle[monthly_reviews_middle['Hotel_Name'] == hotel].sort_values('Year_Month')
  
    hotel_data['Smoothed_Reviews'] = hotel_data['Cumulative_Reviews'].rolling(window=3, center=True, min_periods=1).mean()
    plt.plot(hotel_data['Year_Month'], hotel_data['Smoothed_Reviews'], label=hotel)


for year in enhanced_weekly_summary['Year_Month'].dt.year.unique():
    plt.axvline(pd.Timestamp(f'{year}-12-15'), color='red', linestyle='--', alpha=0.5, label='Christmas/New Year' if year == enhanced_weekly_summary['Year_Month'].dt.year.unique()[0] else "")
    plt.axvline(pd.Timestamp(f'{year}-07-01'), color='green', linestyle='--', alpha=0.5, label='Summer Peak' if year == enhanced_weekly_summary['Year_Month'].dt.year.unique()[0] else "")

plt.title('Monthly Number of Reviews (Smoothed) for Middle 10 Hotels with Seasonality')
plt.xlabel('Month')
plt.ylabel('Number of Reviews (Smoothed)')
plt.legend(loc='upper right', fontsize='small')
plt.grid(True)
plt.tight_layout()
plt.show()
In [122]:
enhanced_weekly_summary['Year_Month'] = pd.to_datetime(
    enhanced_weekly_summary['Year_Week'].astype(str) + '-1',
    format='%Y-%U-%w'
).dt.to_period('M').dt.to_timestamp()

l
hotel_review_counts = (
    enhanced_weekly_summary.groupby('Hotel_Name')['Cumulative_Reviews']
    .max()
    .sort_values(ascending=False)
)


middle_start = len(hotel_review_counts) // 2 - 5
middle_10_hotels = hotel_review_counts.iloc[middle_start:middle_start + 10].index.tolist()


middle_hotels_data = enhanced_weekly_summary[
    enhanced_weekly_summary['Hotel_Name'].isin(middle_10_hotels)
]


monthly_reviews_middle = (
    middle_hotels_data.groupby(['Year_Month', 'Hotel_Name'])['Cumulative_Reviews']
    .sum()
    .reset_index()
)


plt.figure(figsize=(14, 7))

for hotel in middle_10_hotels:
    hotel_data = monthly_reviews_middle[monthly_reviews_middle['Hotel_Name'] == hotel].sort_values('Year_Month')
    hotel_data['Smoothed_Reviews'] = hotel_data['Cumulative_Reviews'].rolling(window=3, center=True, min_periods=1).mean()
    plt.plot(hotel_data['Year_Month'], hotel_data['Smoothed_Reviews'], label=hotel)


for year in enhanced_weekly_summary['Year_Month'].dt.year.unique():
    plt.axvline(pd.Timestamp(f'{year}-12-15'), color='red', linestyle='--', alpha=0.5,
                label='Christmas/New Year' if year == enhanced_weekly_summary['Year_Month'].dt.year.unique()[0] else "")
    plt.axvline(pd.Timestamp(f'{year}-07-01'), color='green', linestyle='--', alpha=0.5,
                label='Summer Peak' if year == enhanced_weekly_summary['Year_Month'].dt.year.unique()[0] else "")


plt.title('Monthly Number of Reviews (Smoothed) for Middle 10 Hotels with Seasonality')
plt.xlabel('Month')
plt.ylabel('Number of Reviews (Smoothed)')
plt.legend(loc='upper right', fontsize='small')
plt.grid(True)
plt.tight_layout()

# Save as PNG image
plt.savefig("middle_10_hotels_smoothed_reviews.png", dpi=300)
plt.show()
In [138]:
hotel_review_counts = (
    enhanced_weekly_summary.groupby('Hotel_Name')['Cumulative_Reviews']
    .max()
    .sort_values(ascending=False)
)

bottom_10_hotels = hotel_review_counts.tail(10).index.tolist()


bottom_hotels_data = enhanced_weekly_summary[enhanced_weekly_summary['Hotel_Name'].isin(bottom_10_hotels)]


monthly_reviews_bottom = (
    bottom_hotels_data.groupby(['Year_Month', 'Hotel_Name'])['Cumulative_Reviews']
    .sum()
    .reset_index()
)


plt.figure(figsize=(14, 7))

for hotel in bottom_10_hotels:
    hotel_data = monthly_reviews_bottom[monthly_reviews_bottom['Hotel_Name'] == hotel].sort_values('Year_Month')
    plt.plot(hotel_data['Year_Month'], hotel_data['Cumulative_Reviews'], label=hotel)


years = enhanced_weekly_summary['Year_Month'].dt.year.unique()

for year in years:
    
    summer_start = pd.Timestamp(f'{year}-06-01')
    summer_end = pd.Timestamp(f'{year}-08-31')
    plt.axvline(summer_start, color='green', linestyle='--', alpha=0.7)
    plt.axvline(summer_end, color='green', linestyle='--', alpha=0.7)
    plt.text(summer_start + (summer_end - summer_start) / 2, plt.ylim()[1]*0.9, 'Summer', color='green', ha='center')

    
    christmas_start = pd.Timestamp(f'{year}-12-15')
    christmas_end = pd.Timestamp(f'{year}-12-31')
    plt.axvline(christmas_start, color='red', linestyle='--', alpha=0.7)
    plt.axvline(christmas_end, color='red', linestyle='--', alpha=0.7)
    plt.text(christmas_start + (christmas_end - christmas_start) / 2, plt.ylim()[1]*0.9, 'Christmas', color='red', ha='center')

plt.title('Monthly Number of Reviews for Bottom 10 Hotels with Seasonality')
plt.xlabel('Month')
plt.ylabel('Number of Reviews')
plt.legend(loc='upper right', fontsize='small')
plt.grid(True)
plt.tight_layout()
plt.show()
In [139]:
hotel_review_counts = (
    enhanced_weekly_summary.groupby('Hotel_Name')['Cumulative_Reviews']
    .max()
    .sort_values(ascending=False)
)

top_10_hotels = hotel_review_counts.head(10).index.tolist()


top_hotels_data = enhanced_weekly_summary[enhanced_weekly_summary['Hotel_Name'].isin(top_10_hotels)]


monthly_positive_words = (
    top_hotels_data.groupby(['Year_Month', 'Hotel_Name'])['Cumulative_Avg_Positive']
    .sum()
    .reset_index()
)


plt.figure(figsize=(14, 7))

for hotel in top_10_hotels:
    hotel_data = monthly_positive_words[monthly_positive_words['Hotel_Name'] == hotel].sort_values('Year_Month')
    plt.plot(hotel_data['Year_Month'], hotel_data['Cumulative_Avg_Positive'], label=hotel)


years = enhanced_weekly_summary['Year_Month'].dt.year.unique()

for year in years:
    # Summer season
    summer_start = pd.Timestamp(f'{year}-06-01')
    summer_end = pd.Timestamp(f'{year}-08-31')
    plt.axvline(summer_start, color='green', linestyle='--', alpha=0.7)
    plt.axvline(summer_end, color='green', linestyle='--', alpha=0.7)
    plt.text(summer_start + (summer_end - summer_start) / 2, plt.ylim()[1]*0.9, 'Summer', color='green', ha='center')

   
    christmas_start = pd.Timestamp(f'{year}-12-15')
    christmas_end = pd.Timestamp(f'{year}-12-31')
    plt.axvline(christmas_start, color='red', linestyle='--', alpha=0.7)
    plt.axvline(christmas_end, color='red', linestyle='--', alpha=0.7)
    plt.text(christmas_start + (christmas_end - christmas_start) / 2, plt.ylim()[1]*0.9, 'Christmas', color='red', ha='center')

plt.title('Monthly Number of Positive Words for Top 10 Hotels with Seasonality')
plt.xlabel('Month')
plt.ylabel('Number of Positive Words')
plt.legend(loc='upper right', fontsize='small')
plt.grid(True)
plt.tight_layout()
plt.show()
In [141]:
if not pd.api.types.is_datetime64_any_dtype(enhanced_weekly_summary['Year_Week']):
  
    enhanced_weekly_summary['Year_Week'] = pd.to_datetime(
        enhanced_weekly_summary['Year_Week'].astype(str) + '-1', 
        format='%Y-%W-%w', 
        errors='coerce'  
    )


enhanced_weekly_summary['Year_Month'] = enhanced_weekly_summary['Year_Week'].dt.to_period('M').dt.to_timestamp()

# Step 3: Get top 10 hotels by total reviews
top_10_hotels = (
    enhanced_weekly_summary.groupby('Hotel_Name')['Cumulative_Reviews']
    .max()
    .sort_values(ascending=False)
    .head(10)
    .index
)


top_hotels_data = enhanced_weekly_summary[enhanced_weekly_summary['Hotel_Name'].isin(top_10_hotels)]

monthly_positive = top_hotels_data.groupby(['Year_Month', 'Hotel_Name'])['Cumulative_Avg_Positive'].sum().reset_index()


plt.figure(figsize=(14, 7))

window = 3  

for hotel in top_10_hotels:
    hotel_data = monthly_positive[monthly_positive['Hotel_Name'] == hotel]
    smoothed = hotel_data['Cumulative_Avg_Positive'].rolling(window=window, center=True).mean()
    plt.plot(hotel_data['Year_Month'], smoothed, label=hotel)


for year in [2015, 2016, 2017]:
    plt.axvline(pd.to_datetime(f'{year}-07-01'), color='green', linestyle='--', alpha=0.5)
    plt.text(pd.to_datetime(f'{year}-07-01'), plt.ylim()[1]*0.9, 'Summer', color='green', ha='center')
    plt.axvline(pd.to_datetime(f'{year}-12-20'), color='red', linestyle='--', alpha=0.5)
    plt.text(pd.to_datetime(f'{year}-12-20'), plt.ylim()[1]*0.8, 'Christmas', color='red', ha='center')

plt.title('Moving Average (3 Months) of Positive Words for Top 10 Hotels')
plt.xlabel('Month')
plt.ylabel('Smoothed Number of Positive Words')
plt.legend(loc='upper right', fontsize='small')
plt.tight_layout()
plt.show()
In [142]:
if not pd.api.types.is_datetime64_any_dtype(enhanced_weekly_summary['Year_Week']):
    enhanced_weekly_summary['Year_Week'] = pd.to_datetime(
        enhanced_weekly_summary['Year_Week'].astype(str) + '-1',
        format='%Y-%W-%w',
        errors='coerce'
    )


enhanced_weekly_summary['Year_Month'] = enhanced_weekly_summary['Year_Week'].dt.to_period('M').dt.to_timestamp()


top_10_hotels = (
    enhanced_weekly_summary.groupby('Hotel_Name')['Cumulative_Reviews']
    .max()
    .sort_values(ascending=False)
    .head(10)
    .index
)


top_hotels_data = enhanced_weekly_summary[enhanced_weekly_summary['Hotel_Name'].isin(top_10_hotels)]


monthly_negative = top_hotels_data.groupby(['Year_Month', 'Hotel_Name'])['Cumulative_Avg_Negative'].sum().reset_index()


plt.figure(figsize=(14, 7))

for hotel in top_10_hotels:
    hotel_data = monthly_negative[monthly_negative['Hotel_Name'] == hotel]
    plt.plot(hotel_data['Year_Month'], hotel_data['Cumulative_Avg_Negative'], label=hotel)


for year in [2015, 2016, 2017]:
    plt.axvline(pd.to_datetime(f'{year}-07-01'), color='green', linestyle='--', alpha=0.5)
    plt.text(pd.to_datetime(f'{year}-07-01'), plt.ylim()[1]*0.9, 'Summer', color='green', ha='center')
    plt.axvline(pd.to_datetime(f'{year}-12-20'), color='red', linestyle='--', alpha=0.5)
    plt.text(pd.to_datetime(f'{year}-12-20'), plt.ylim()[1]*0.8, 'Christmas', color='red', ha='center')

plt.title('Monthly Number of Negative Words for Top 10 Hotels with Seasonality')
plt.xlabel('Month')
plt.ylabel('Number of Negative Words')
plt.legend(loc='upper right', fontsize='small')
plt.tight_layout()
plt.show()
In [125]:
if not pd.api.types.is_datetime64_any_dtype(enhanced_weekly_summary['Year_Week']):
    enhanced_weekly_summary['Year_Week'] = pd.to_datetime(
        enhanced_weekly_summary['Year_Week'].astype(str) + '-1',
        format='%Y-%W-%w',
        errors='coerce'
    )


enhanced_weekly_summary['Year_Month'] = enhanced_weekly_summary['Year_Week'].dt.to_period('M').dt.to_timestamp()


top_10_hotels = (
    enhanced_weekly_summary.groupby('Hotel_Name')['Cumulative_Reviews']
    .max()
    .sort_values(ascending=False)
    .head(10)
    .index
)


top_hotels_data = enhanced_weekly_summary[enhanced_weekly_summary['Hotel_Name'].isin(top_10_hotels)]


monthly_negative = top_hotels_data.groupby(['Year_Month', 'Hotel_Name'])['Cumulative_Avg_Negative'].sum().reset_index()


monthly_negative['Negative_Words_SMA'] = (
    monthly_negative.groupby('Hotel_Name')['Cumulative_Avg_Negative']
    .transform(lambda x: x.rolling(window=3, min_periods=1).mean())
)


plt.figure(figsize=(14, 7))

for hotel in top_10_hotels:
    hotel_data = monthly_negative[monthly_negative['Hotel_Name'] == hotel]
    plt.plot(hotel_data['Year_Month'], hotel_data['Negative_Words_SMA'], label=hotel)


for year in [2015, 2016, 2017]:
    plt.axvline(pd.to_datetime(f'{year}-07-01'), color='green', linestyle='--', alpha=0.5)
    plt.text(pd.to_datetime(f'{year}-07-01'), plt.ylim()[1]*0.9, 'Summer', color='green', ha='center')
    plt.axvline(pd.to_datetime(f'{year}-12-20'), color='red', linestyle='--', alpha=0.5)
    plt.text(pd.to_datetime(f'{year}-12-20'), plt.ylim()[1]*0.8, 'Christmas', color='red', ha='center')

plt.title('Monthly Number of Negative Words for Top 10 Hotels (Smoothed with Moving Average)')
plt.xlabel('Month')
plt.ylabel('Number of Negative Words (3-month Moving Avg)')
plt.legend(loc='upper right', fontsize='small')
plt.tight_layout()

plt.savefig("top_10_hotels_negative_words.png", dpi=300)
plt.show()